OSGeoLive-Notebooks/Pandas/solved - 03 - Indexing and ...

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Indexing and selecting data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "try:\n",
    "    import seaborn\n",
    "except ImportError:\n",
    "    pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>country</th>\n",
       "      <th>population</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>30510</td>\n",
       "      <td>Brussels</td>\n",
       "      <td>Belgium</td>\n",
       "      <td>11.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>671308</td>\n",
       "      <td>Paris</td>\n",
       "      <td>France</td>\n",
       "      <td>64.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>Germany</td>\n",
       "      <td>81.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>41526</td>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>Netherlands</td>\n",
       "      <td>16.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>244820</td>\n",
       "      <td>London</td>\n",
       "      <td>United Kingdom</td>\n",
       "      <td>64.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     area    capital         country  population\n",
       "0   30510   Brussels         Belgium        11.3\n",
       "1  671308      Paris          France        64.3\n",
       "2  357050     Berlin         Germany        81.3\n",
       "3   41526  Amsterdam     Netherlands        16.9\n",
       "4  244820     London  United Kingdom        64.9"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# redefining the example objects\n",
    "\n",
    "# series\n",
    "population = pd.Series({'Germany': 81.3, 'Belgium': 11.3, 'France': 64.3, \n",
    "                        'United Kingdom': 64.9, 'Netherlands': 16.9})\n",
    "\n",
    "# dataframe\n",
    "data = {'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],\n",
    "        'population': [11.3, 64.3, 81.3, 16.9, 64.9],\n",
    "        'area': [30510, 671308, 357050, 41526, 244820],\n",
    "        'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London']}\n",
    "countries = pd.DataFrame(data)\n",
    "countries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Setting the index to the country names:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>30510</td>\n",
       "      <td>Brussels</td>\n",
       "      <td>11.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>671308</td>\n",
       "      <td>Paris</td>\n",
       "      <td>64.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>81.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>41526</td>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>16.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>London</td>\n",
       "      <td>64.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area    capital  population\n",
       "country                                      \n",
       "Belgium          30510   Brussels        11.3\n",
       "France          671308      Paris        64.3\n",
       "Germany         357050     Berlin        81.3\n",
       "Netherlands      41526  Amsterdam        16.9\n",
       "United Kingdom  244820     London        64.9"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries = countries.set_index('country')\n",
    "countries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Some notes on selecting data\n",
    "\n",
    "One of pandas' basic features is the labeling of rows and columns, but this makes indexing also a bit more complex compared to numpy. We now have to distuinguish between:\n",
    "\n",
    "- selection by label\n",
    "- selection by position."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### `data[]` provides some convenience shortcuts "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For a DataFrame, basic indexing selects the columns.\n",
    "\n",
    "Selecting a single column:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country\n",
       "Belgium            30510\n",
       "France            671308\n",
       "Germany           357050\n",
       "Netherlands        41526\n",
       "United Kingdom    244820\n",
       "Name: area, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries['area']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "or multiple columns:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>population</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>30510</td>\n",
       "      <td>11.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>671308</td>\n",
       "      <td>64.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>81.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>41526</td>\n",
       "      <td>16.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>64.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area  population\n",
       "country                           \n",
       "Belgium          30510        11.3\n",
       "France          671308        64.3\n",
       "Germany         357050        81.3\n",
       "Netherlands      41526        16.9\n",
       "United Kingdom  244820        64.9"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries[['area', 'population']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "But, slicing accesses the rows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>671308</td>\n",
       "      <td>Paris</td>\n",
       "      <td>64.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>81.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>41526</td>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>16.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               area    capital  population\n",
       "country                                   \n",
       "France       671308      Paris        64.3\n",
       "Germany      357050     Berlin        81.3\n",
       "Netherlands   41526  Amsterdam        16.9"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries['France':'Netherlands']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-danger\">\n",
    "    <b>NOTE</b>: Unlike slicing in numpy, the end label is **included**.\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So as a summary, `[]` provides the following convenience shortcuts:\n",
    "\n",
    "- Series: selecting a label: `s[label]`\n",
    "- DataFrame: selecting a single or multiple columns: `df['col']` or `df[['col1', 'col2']]`\n",
    "- DataFrame: slicing the rows: `df['row_label1':'row_label2']` or `df[mask]`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Systematic indexing with `loc` and `iloc`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When using `[]` like above, you can only select from one axis at once (rows or columns, not both). For more advanced indexing, you have some extra attributes:\n",
    "    \n",
    "* `loc`: selection by label\n",
    "* `iloc`: selection by position\n",
    "\n",
    "These methods index the different dimensions of the frame:\n",
    "\n",
    "* `df.loc[row_indexer, column_indexer]`\n",
    "* `df.iloc[row_indexer, column_indexer]`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Selecting a single element:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "357050"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries.loc['Germany', 'area']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "But the row or column indexer can also be a list, slice, boolean array, .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>population</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>671308</td>\n",
       "      <td>64.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>81.3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           area  population\n",
       "country                    \n",
       "France   671308        64.3\n",
       "Germany  357050        81.3"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries.loc['France':'Germany', ['area', 'population']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "Selecting by position with `iloc` works similar as indexing numpy arrays:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>Brussels</td>\n",
       "      <td>11.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>Paris</td>\n",
       "      <td>64.3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          capital  population\n",
       "country                      \n",
       "Belgium  Brussels        11.3\n",
       "France      Paris        64.3"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries.iloc[0:2,1:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The different indexing methods can also be used to assign data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "countries2 = countries.copy()\n",
    "countries2.loc['Belgium':'Germany', 'population'] = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>30510</td>\n",
       "      <td>Brussels</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>671308</td>\n",
       "      <td>Paris</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>41526</td>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>16.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>London</td>\n",
       "      <td>64.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area    capital  population\n",
       "country                                      \n",
       "Belgium          30510   Brussels        10.0\n",
       "France          671308      Paris        10.0\n",
       "Germany         357050     Berlin        10.0\n",
       "Netherlands      41526  Amsterdam        16.9\n",
       "United Kingdom  244820     London        64.9"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Boolean indexing (filtering)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Often, you want to select rows based on a certain condition. This can be done with 'boolean indexing' (like a where clause in SQL). \n",
    "\n",
    "The indexer (or boolean mask) should be 1-dimensional and the same length as the thing being indexed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country\n",
       "Belgium           False\n",
       "France             True\n",
       "Germany            True\n",
       "Netherlands       False\n",
       "United Kingdom     True\n",
       "Name: area, dtype: bool"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries['area'] > 100000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>671308</td>\n",
       "      <td>Paris</td>\n",
       "      <td>64.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>81.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>London</td>\n",
       "      <td>64.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area capital  population\n",
       "country                                   \n",
       "France          671308   Paris        64.3\n",
       "Germany         357050  Berlin        81.3\n",
       "United Kingdom  244820  London        64.9"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries[countries['area'] > 100000]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "\n",
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: Add a column `density` with the population density (note: population column is expressed in millions)\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "      <th>density</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>30510</td>\n",
       "      <td>Brussels</td>\n",
       "      <td>11.3</td>\n",
       "      <td>370.370370</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>671308</td>\n",
       "      <td>Paris</td>\n",
       "      <td>64.3</td>\n",
       "      <td>95.783158</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>81.3</td>\n",
       "      <td>227.699202</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>41526</td>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>16.9</td>\n",
       "      <td>406.973944</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>London</td>\n",
       "      <td>64.9</td>\n",
       "      <td>265.092721</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area    capital  population     density\n",
       "country                                                  \n",
       "Belgium          30510   Brussels        11.3  370.370370\n",
       "France          671308      Paris        64.3   95.783158\n",
       "Germany         357050     Berlin        81.3  227.699202\n",
       "Netherlands      41526  Amsterdam        16.9  406.973944\n",
       "United Kingdom  244820     London        64.9  265.092721"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries['density'] = countries['population']*1000000 / countries['area']\n",
    "countries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: Select the capital and the population column of those countries where the density is larger than 300\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>Brussels</td>\n",
       "      <td>11.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>16.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               capital  population\n",
       "country                           \n",
       "Belgium       Brussels        11.3\n",
       "Netherlands  Amsterdam        16.9"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries.loc[countries['density'] > 300, ['capital', 'population']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: Add a column 'density_ratio' with the ratio of the density to the mean density\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "      <th>density</th>\n",
       "      <th>density_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>30510</td>\n",
       "      <td>Brussels</td>\n",
       "      <td>11.3</td>\n",
       "      <td>370.370370</td>\n",
       "      <td>1.355755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>671308</td>\n",
       "      <td>Paris</td>\n",
       "      <td>64.3</td>\n",
       "      <td>95.783158</td>\n",
       "      <td>0.350618</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>81.3</td>\n",
       "      <td>227.699202</td>\n",
       "      <td>0.833502</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>41526</td>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>16.9</td>\n",
       "      <td>406.973944</td>\n",
       "      <td>1.489744</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>London</td>\n",
       "      <td>64.9</td>\n",
       "      <td>265.092721</td>\n",
       "      <td>0.970382</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area    capital  population     density  density_ratio\n",
       "country                                                                 \n",
       "Belgium          30510   Brussels        11.3  370.370370       1.355755\n",
       "France          671308      Paris        64.3   95.783158       0.350618\n",
       "Germany         357050     Berlin        81.3  227.699202       0.833502\n",
       "Netherlands      41526  Amsterdam        16.9  406.973944       1.489744\n",
       "United Kingdom  244820     London        64.9  265.092721       0.970382"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries['density_ratio'] = countries['density'] / countries['density'].mean()\n",
    "countries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: Change the capital of the UK to Cambridge\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "      <th>density</th>\n",
       "      <th>density_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>30510</td>\n",
       "      <td>Brussels</td>\n",
       "      <td>11.3</td>\n",
       "      <td>370.370370</td>\n",
       "      <td>1.355755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>671308</td>\n",
       "      <td>Paris</td>\n",
       "      <td>64.3</td>\n",
       "      <td>95.783158</td>\n",
       "      <td>0.350618</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>81.3</td>\n",
       "      <td>227.699202</td>\n",
       "      <td>0.833502</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>41526</td>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>16.9</td>\n",
       "      <td>406.973944</td>\n",
       "      <td>1.489744</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>Cambridge</td>\n",
       "      <td>64.9</td>\n",
       "      <td>265.092721</td>\n",
       "      <td>0.970382</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area    capital  population     density  density_ratio\n",
       "country                                                                 \n",
       "Belgium          30510   Brussels        11.3  370.370370       1.355755\n",
       "France          671308      Paris        64.3   95.783158       0.350618\n",
       "Germany         357050     Berlin        81.3  227.699202       0.833502\n",
       "Netherlands      41526  Amsterdam        16.9  406.973944       1.489744\n",
       "United Kingdom  244820  Cambridge        64.9  265.092721       0.970382"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries.loc['United Kingdom', 'capital'] = 'Cambridge'\n",
    "countries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: Select all countries whose population density is between 100 and 300 people/km²\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "      <th>density</th>\n",
       "      <th>density_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>81.3</td>\n",
       "      <td>227.699202</td>\n",
       "      <td>0.833502</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>Cambridge</td>\n",
       "      <td>64.9</td>\n",
       "      <td>265.092721</td>\n",
       "      <td>0.970382</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area    capital  population     density  density_ratio\n",
       "country                                                                 \n",
       "Germany         357050     Berlin        81.3  227.699202       0.833502\n",
       "United Kingdom  244820  Cambridge        64.9  265.092721       0.970382"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries[(countries['density'] > 100) & (countries['density'] < 300)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Some other useful methods: `isin` and string methods"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `isin` method of Series is very useful to select rows that may contain certain values:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "s = countries['capital']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "s.isin?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country\n",
       "Belgium           False\n",
       "France            False\n",
       "Germany            True\n",
       "Netherlands       False\n",
       "United Kingdom    False\n",
       "Name: capital, dtype: bool"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.isin(['Berlin', 'London'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This can then be used to filter the dataframe with boolean indexing:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "      <th>density</th>\n",
       "      <th>density_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>81.3</td>\n",
       "      <td>227.699202</td>\n",
       "      <td>0.833502</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           area capital  population     density  density_ratio\n",
       "country                                                       \n",
       "Germany  357050  Berlin        81.3  227.699202       0.833502"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries[countries['capital'].isin(['Berlin', 'London'])]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's say we want to select all data for which the capital starts with a 'B'. In Python, when having a string, we could use the `startswith` method:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'Berlin'.startswith('B')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In pandas, these are available on a Series through the `str` namespace:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country\n",
       "Belgium            True\n",
       "France            False\n",
       "Germany            True\n",
       "Netherlands       False\n",
       "United Kingdom    False\n",
       "Name: capital, dtype: bool"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries['capital'].str.startswith('B')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For an overview of all string methods, see: http://pandas.pydata.org/pandas-docs/stable/api.html#string-handling"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: Select all countries that have capital names with more than 7 characters\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "      <th>density</th>\n",
       "      <th>density_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>30510</td>\n",
       "      <td>Brussels</td>\n",
       "      <td>11.3</td>\n",
       "      <td>370.370370</td>\n",
       "      <td>1.355755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>41526</td>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>16.9</td>\n",
       "      <td>406.973944</td>\n",
       "      <td>1.489744</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>Cambridge</td>\n",
       "      <td>64.9</td>\n",
       "      <td>265.092721</td>\n",
       "      <td>0.970382</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area    capital  population     density  density_ratio\n",
       "country                                                                 \n",
       "Belgium          30510   Brussels        11.3  370.370370       1.355755\n",
       "Netherlands      41526  Amsterdam        16.9  406.973944       1.489744\n",
       "United Kingdom  244820  Cambridge        64.9  265.092721       0.970382"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries[countries['capital'].str.len() > 7]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: Select all countries that have capital names that contain the character sequence 'am'\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "      <th>density</th>\n",
       "      <th>density_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>41526</td>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>16.9</td>\n",
       "      <td>406.973944</td>\n",
       "      <td>1.489744</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>Cambridge</td>\n",
       "      <td>64.9</td>\n",
       "      <td>265.092721</td>\n",
       "      <td>0.970382</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area    capital  population     density  density_ratio\n",
       "country                                                                 \n",
       "Netherlands      41526  Amsterdam        16.9  406.973944       1.489744\n",
       "United Kingdom  244820  Cambridge        64.9  265.092721       0.970382"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries[countries['capital'].str.contains('am')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pitfall: chained indexing (and the 'SettingWithCopyWarning')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "countries.loc['Belgium', 'capital'] = 'Ghent' "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "      <th>density</th>\n",
       "      <th>density_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>30510</td>\n",
       "      <td>Ghent</td>\n",
       "      <td>11.3</td>\n",
       "      <td>370.370370</td>\n",
       "      <td>1.355755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>671308</td>\n",
       "      <td>Paris</td>\n",
       "      <td>64.3</td>\n",
       "      <td>95.783158</td>\n",
       "      <td>0.350618</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>81.3</td>\n",
       "      <td>227.699202</td>\n",
       "      <td>0.833502</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>41526</td>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>16.9</td>\n",
       "      <td>406.973944</td>\n",
       "      <td>1.489744</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>Cambridge</td>\n",
       "      <td>64.9</td>\n",
       "      <td>265.092721</td>\n",
       "      <td>0.970382</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area    capital  population     density  density_ratio\n",
       "country                                                                 \n",
       "Belgium          30510      Ghent        11.3  370.370370       1.355755\n",
       "France          671308      Paris        64.3   95.783158       0.350618\n",
       "Germany         357050     Berlin        81.3  227.699202       0.833502\n",
       "Netherlands      41526  Amsterdam        16.9  406.973944       1.489744\n",
       "United Kingdom  244820  Cambridge        64.9  265.092721       0.970382"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/joris/miniconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  if __name__ == '__main__':\n"
     ]
    }
   ],
   "source": [
    "countries['capital']['Belgium'] = 'Antwerp' "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "      <th>density</th>\n",
       "      <th>density_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>30510</td>\n",
       "      <td>Antwerp</td>\n",
       "      <td>11.3</td>\n",
       "      <td>370.370370</td>\n",
       "      <td>1.355755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>671308</td>\n",
       "      <td>Paris</td>\n",
       "      <td>64.3</td>\n",
       "      <td>95.783158</td>\n",
       "      <td>0.350618</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>81.3</td>\n",
       "      <td>227.699202</td>\n",
       "      <td>0.833502</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>41526</td>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>16.9</td>\n",
       "      <td>406.973944</td>\n",
       "      <td>1.489744</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>Cambridge</td>\n",
       "      <td>64.9</td>\n",
       "      <td>265.092721</td>\n",
       "      <td>0.970382</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area    capital  population     density  density_ratio\n",
       "country                                                                 \n",
       "Belgium          30510    Antwerp        11.3  370.370370       1.355755\n",
       "France          671308      Paris        64.3   95.783158       0.350618\n",
       "Germany         357050     Berlin        81.3  227.699202       0.833502\n",
       "Netherlands      41526  Amsterdam        16.9  406.973944       1.489744\n",
       "United Kingdom  244820  Cambridge        64.9  265.092721       0.970382"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/joris/miniconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  if __name__ == '__main__':\n"
     ]
    }
   ],
   "source": [
    "countries[countries['capital'] == 'Antwerp']['capital'] = 'Brussels' "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>capital</th>\n",
       "      <th>population</th>\n",
       "      <th>density</th>\n",
       "      <th>density_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Belgium</th>\n",
       "      <td>30510</td>\n",
       "      <td>Antwerp</td>\n",
       "      <td>11.3</td>\n",
       "      <td>370.370370</td>\n",
       "      <td>1.355755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>France</th>\n",
       "      <td>671308</td>\n",
       "      <td>Paris</td>\n",
       "      <td>64.3</td>\n",
       "      <td>95.783158</td>\n",
       "      <td>0.350618</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Germany</th>\n",
       "      <td>357050</td>\n",
       "      <td>Berlin</td>\n",
       "      <td>81.3</td>\n",
       "      <td>227.699202</td>\n",
       "      <td>0.833502</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Netherlands</th>\n",
       "      <td>41526</td>\n",
       "      <td>Amsterdam</td>\n",
       "      <td>16.9</td>\n",
       "      <td>406.973944</td>\n",
       "      <td>1.489744</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>United Kingdom</th>\n",
       "      <td>244820</td>\n",
       "      <td>Cambridge</td>\n",
       "      <td>64.9</td>\n",
       "      <td>265.092721</td>\n",
       "      <td>0.970382</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  area    capital  population     density  density_ratio\n",
       "country                                                                 \n",
       "Belgium          30510    Antwerp        11.3  370.370370       1.355755\n",
       "France          671308      Paris        64.3   95.783158       0.350618\n",
       "Germany         357050     Berlin        81.3  227.699202       0.833502\n",
       "Netherlands      41526  Amsterdam        16.9  406.973944       1.489744\n",
       "United Kingdom  244820  Cambridge        64.9  265.092721       0.970382"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How to avoid this?\n",
    "\n",
    "* Use `loc` instead of chained indexing if possible!\n",
    "* Or `copy` explicitly if you don't want to change the original data."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## More exercises!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For the quick ones among you, here are some more exercises with some larger dataframe with film data. These exercises are based on the [PyCon tutorial of Brandon Rhodes](https://github.com/brandon-rhodes/pycon-pandas-tutorial/) (so all credit to him!) and the datasets he prepared for that. You can download these data from here: [`titles.csv`](https://drive.google.com/open?id=0B3G70MlBnCgKajNMa1pfSzN6Q3M) and [`cast.csv`](https://drive.google.com/open?id=0B3G70MlBnCgKal9UYTJSR2ZhSW8) and put them in the `/data` folder."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>year</th>\n",
       "      <th>name</th>\n",
       "      <th>type</th>\n",
       "      <th>character</th>\n",
       "      <th>n</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Suuri illusioni</td>\n",
       "      <td>1985</td>\n",
       "      <td>Homo $</td>\n",
       "      <td>actor</td>\n",
       "      <td>Guests</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Gangsta Rap: The Glockumentary</td>\n",
       "      <td>2007</td>\n",
       "      <td>Too $hort</td>\n",
       "      <td>actor</td>\n",
       "      <td>Himself</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Menace II Society</td>\n",
       "      <td>1993</td>\n",
       "      <td>Too $hort</td>\n",
       "      <td>actor</td>\n",
       "      <td>Lew-Loc</td>\n",
       "      <td>27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Porndogs: The Adventures of Sadie</td>\n",
       "      <td>2009</td>\n",
       "      <td>Too $hort</td>\n",
       "      <td>actor</td>\n",
       "      <td>Bosco</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Stop Pepper Palmer</td>\n",
       "      <td>2014</td>\n",
       "      <td>Too $hort</td>\n",
       "      <td>actor</td>\n",
       "      <td>Himself</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               title  year       name   type character   n\n",
       "0                    Suuri illusioni  1985     Homo $  actor    Guests  22\n",
       "1     Gangsta Rap: The Glockumentary  2007  Too $hort  actor   Himself NaN\n",
       "2                  Menace II Society  1993  Too $hort  actor   Lew-Loc  27\n",
       "3  Porndogs: The Adventures of Sadie  2009  Too $hort  actor     Bosco   3\n",
       "4                 Stop Pepper Palmer  2014  Too $hort  actor   Himself NaN"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cast = pd.read_csv('data/cast.csv')\n",
    "cast.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The Rising Son</td>\n",
       "      <td>1990</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Ashes of Kukulcan</td>\n",
       "      <td>2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The Thousand Plane Raid</td>\n",
       "      <td>1969</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Crucea de piatra</td>\n",
       "      <td>1993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The 86</td>\n",
       "      <td>2015</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     title  year\n",
       "0           The Rising Son  1990\n",
       "1        Ashes of Kukulcan  2016\n",
       "2  The Thousand Plane Raid  1969\n",
       "3         Crucea de piatra  1993\n",
       "4                   The 86  2015"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titles = pd.read_csv('data/titles.csv')\n",
    "titles.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: How many movies are listed in the titles dataframe?\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "215981"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(titles)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: What are the earliest two films listed in the titles dataframe?\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>165182</th>\n",
       "      <td>Miss Jerry</td>\n",
       "      <td>1894</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85708</th>\n",
       "      <td>Reproduction of the Corbett and Fitzsimmons Fight</td>\n",
       "      <td>1897</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    title  year\n",
       "165182                                         Miss Jerry  1894\n",
       "85708   Reproduction of the Corbett and Fitzsimmons Fight  1897"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titles.sort('year').head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: How many movies have the title \"Hamlet\"?\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "19"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(titles[titles.title == 'Hamlet'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: List all of the \"Treasure Island\" movies from earliest to most recent.\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>191379</th>\n",
       "      <td>Treasure Island</td>\n",
       "      <td>1918</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47769</th>\n",
       "      <td>Treasure Island</td>\n",
       "      <td>1920</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>192917</th>\n",
       "      <td>Treasure Island</td>\n",
       "      <td>1934</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>90175</th>\n",
       "      <td>Treasure Island</td>\n",
       "      <td>1950</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104714</th>\n",
       "      <td>Treasure Island</td>\n",
       "      <td>1972</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103646</th>\n",
       "      <td>Treasure Island</td>\n",
       "      <td>1973</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>190792</th>\n",
       "      <td>Treasure Island</td>\n",
       "      <td>1985</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>166675</th>\n",
       "      <td>Treasure Island</td>\n",
       "      <td>1999</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  title  year\n",
       "191379  Treasure Island  1918\n",
       "47769   Treasure Island  1920\n",
       "192917  Treasure Island  1934\n",
       "90175   Treasure Island  1950\n",
       "104714  Treasure Island  1972\n",
       "103646  Treasure Island  1973\n",
       "190792  Treasure Island  1985\n",
       "166675  Treasure Island  1999"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titles[titles.title == 'Treasure Island'].sort('year')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: How many movies were made from 1950 through 1959?\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12120"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t = titles\n",
    "len(t[(t.year >= 1950) & (t.year <= 1959)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12120"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(t[t.year // 10 == 195])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: How many roles in the movie \"Inception\" are NOT ranked by an \"n\" value?\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "22"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = cast\n",
    "c = c[c.title == 'Inception']\n",
    "c = c[c.n.isnull()]\n",
    "len(c)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: But how many roles in the movie \"Inception\" did receive an \"n\" value?\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "51"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = cast\n",
    "c = c[c.title == 'Inception']\n",
    "c = c[c.n.notnull()]\n",
    "len(c)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: Display the cast of \"North by Northwest\" in their correct \"n\"-value order, ignoring roles that did not earn a numeric \"n\" value.\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>year</th>\n",
       "      <th>name</th>\n",
       "      <th>type</th>\n",
       "      <th>character</th>\n",
       "      <th>n</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>778278</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Cary Grant</td>\n",
       "      <td>actor</td>\n",
       "      <td>Roger O. Thornhill</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3104905</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Eva Marie Saint</td>\n",
       "      <td>actress</td>\n",
       "      <td>Eve Kendall</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1300582</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>James Mason</td>\n",
       "      <td>actor</td>\n",
       "      <td>Phillip Vandamm</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2794420</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Jessie Royce Landis</td>\n",
       "      <td>actress</td>\n",
       "      <td>Clara Thornhill</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>317310</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Leo G. Carroll</td>\n",
       "      <td>actor</td>\n",
       "      <td>The Professor</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2702399</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Josephine Hutchinson</td>\n",
       "      <td>actress</td>\n",
       "      <td>Mrs. Townsend</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1513974</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Philip Ober</td>\n",
       "      <td>actor</td>\n",
       "      <td>Lester Townsend</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1137555</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Martin Landau</td>\n",
       "      <td>actor</td>\n",
       "      <td>Leonard</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2181363</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Adam Williams</td>\n",
       "      <td>actor</td>\n",
       "      <td>Valerian</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1616877</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Edward Platt</td>\n",
       "      <td>actor</td>\n",
       "      <td>Victor Larrabee</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>594433</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Robert Ellenstein</td>\n",
       "      <td>actor</td>\n",
       "      <td>Licht</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2046317</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Les Tremayne</td>\n",
       "      <td>actor</td>\n",
       "      <td>Auctioneer</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>413825</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Philip Coolidge</td>\n",
       "      <td>actor</td>\n",
       "      <td>Dr. Cross</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1347170</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Patrick McVey</td>\n",
       "      <td>actor</td>\n",
       "      <td>Sergeant Flamm</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182129</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Edward Binns</td>\n",
       "      <td>actor</td>\n",
       "      <td>Captain Junket</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1235786</th>\n",
       "      <td>North by Northwest</td>\n",
       "      <td>1959</td>\n",
       "      <td>Ken Lynch</td>\n",
       "      <td>actor</td>\n",
       "      <td>Charley - Chicago Policeman</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      title  year                  name     type  \\\n",
       "778278   North by Northwest  1959            Cary Grant    actor   \n",
       "3104905  North by Northwest  1959       Eva Marie Saint  actress   \n",
       "1300582  North by Northwest  1959           James Mason    actor   \n",
       "2794420  North by Northwest  1959   Jessie Royce Landis  actress   \n",
       "317310   North by Northwest  1959        Leo G. Carroll    actor   \n",
       "2702399  North by Northwest  1959  Josephine Hutchinson  actress   \n",
       "1513974  North by Northwest  1959           Philip Ober    actor   \n",
       "1137555  North by Northwest  1959         Martin Landau    actor   \n",
       "2181363  North by Northwest  1959         Adam Williams    actor   \n",
       "1616877  North by Northwest  1959          Edward Platt    actor   \n",
       "594433   North by Northwest  1959     Robert Ellenstein    actor   \n",
       "2046317  North by Northwest  1959          Les Tremayne    actor   \n",
       "413825   North by Northwest  1959       Philip Coolidge    actor   \n",
       "1347170  North by Northwest  1959         Patrick McVey    actor   \n",
       "182129   North by Northwest  1959          Edward Binns    actor   \n",
       "1235786  North by Northwest  1959             Ken Lynch    actor   \n",
       "\n",
       "                           character   n  \n",
       "778278            Roger O. Thornhill   1  \n",
       "3104905                  Eve Kendall   2  \n",
       "1300582              Phillip Vandamm   3  \n",
       "2794420              Clara Thornhill   4  \n",
       "317310                 The Professor   5  \n",
       "2702399                Mrs. Townsend   6  \n",
       "1513974              Lester Townsend   7  \n",
       "1137555                      Leonard   8  \n",
       "2181363                     Valerian   9  \n",
       "1616877              Victor Larrabee  10  \n",
       "594433                         Licht  11  \n",
       "2046317                   Auctioneer  12  \n",
       "413825                     Dr. Cross  13  \n",
       "1347170               Sergeant Flamm  14  \n",
       "182129                Captain Junket  15  \n",
       "1235786  Charley - Chicago Policeman  16  "
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = cast\n",
    "c = c[c.title == 'North by Northwest']\n",
    "c = c[c.n.notnull()]\n",
    "c = c.sort('n')\n",
    "c"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: How many roles were credited in the silent 1921 version of Hamlet?\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = cast\n",
    "c = c[(c.title == 'Hamlet') & (c.year == 1921)]\n",
    "len(c)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-success\">\n",
    "    <b>EXERCISE</b>: List the supporting roles (having n=2) played by Cary Grant in the 1940s, in order by year.\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "clear_cell": true,
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>year</th>\n",
       "      <th>name</th>\n",
       "      <th>type</th>\n",
       "      <th>character</th>\n",
       "      <th>n</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>778275</th>\n",
       "      <td>My Favorite Wife</td>\n",
       "      <td>1940</td>\n",
       "      <td>Cary Grant</td>\n",
       "      <td>actor</td>\n",
       "      <td>Nick</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>778285</th>\n",
       "      <td>Penny Serenade</td>\n",
       "      <td>1941</td>\n",
       "      <td>Cary Grant</td>\n",
       "      <td>actor</td>\n",
       "      <td>Roger Adams</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   title  year        name   type    character  n\n",
       "778275  My Favorite Wife  1940  Cary Grant  actor         Nick  2\n",
       "778285    Penny Serenade  1941  Cary Grant  actor  Roger Adams  2"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = cast\n",
    "c = c[c.name == 'Cary Grant']\n",
    "c = c[c.year // 10 == 194]\n",
    "c = c[c.n == 2]\n",
    "c = c.sort('year')\n",
    "c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "celltoolbar": "Nbtutor - export exercises",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}