from palmerpenguins import load_penguins

# Load the Palmer penguins table (presumably a pandas DataFrame -- the code
# below relies on it exposing a ``species`` column with ``.unique()``).
data = load_penguins()

# Sort the distinct species names so the class order is deterministic
# between runs instead of depending on the order rows happen to appear in.
target_names = sorted(data["species"].unique())

# A notebook cell only echoes its *final* expression, so this is the single
# value displayed; a bare ``data`` earlier in the cell would show nothing.
target_names
class PenguinDataset(Dataset):
    """Map-style PyTorch dataset over the Palmer penguins table.

    Each item is a ``(features, target)`` pair:

    * ``features`` -- 1-D ``float32`` tensor holding the ``input_keys``
      columns of one row, in the order the keys were given.
    * ``target`` -- 1-D one-hot tensor over the sorted species names.

    The table is loaded once in ``__init__``, rows containing any missing
    value are dropped, and a fixed-seed validation split is carved out; the
    remaining rows form the training split.
    """

    def __init__(
        self,
        input_keys: List[str],
        target_keys: List[str],
        train: bool,
    ):
        """Build ``PenguinDataset``.

        Parameters
        ----------
        input_keys : List[str]
            Column names used as input features. They must be numeric
            columns, since they are cast to ``float32`` in ``__getitem__``.
        target_keys : List[str]
            Column names holding the target. Only the first entry is used
            and it is looked up in the species list, so in practice this is
            ``["species"]``.
        train : bool
            Truthy selects the training split, falsy the validation split.
        """
        super().__init__()
        self.input_keys = input_keys
        self.target_keys = target_keys

        data = load_penguins()
        # Drop incomplete rows, then sort by every column (in alphabetical
        # column order) so the row order -- and therefore the sampled split
        # below -- is reproducible.
        data = (
            data.loc[~data.isna().any(axis=1)]
            .sort_values(by=sorted(data.keys()))
            .reset_index(drop=True)
        )
        # Encode sex as a float: male -> 1.0, female -> 0.0.
        data["sex"] = (data["sex"] == "male").astype(float)
        self.full_df = data

        # Class order for the one-hot targets. Computed once here rather
        # than re-sorted on every ``__getitem__`` call.
        self.target_names = sorted(self.full_df["species"].unique())

        # Validation split: 10 rows from every (species, sex) group, drawn
        # with a fixed seed so the split is identical on every run.
        valid_df = self.full_df.groupby(by=["species", "sex"]).sample(
            n=10,
            random_state=123,
        )
        # The training items are simply the items *not* in the valid split.
        train_df = self.full_df.loc[~self.full_df.index.isin(valid_df.index)]

        # Truthiness (not ``is True``) so flags such as ``numpy.bool_`` do
        # not silently fall through to the validation split.
        self.split = train_df if train else valid_df

    def __len__(self) -> int:
        """Return the number of rows in the selected split."""
        return len(self.split)

    def __getitem__(self, idx: int) -> Tuple[Any, Any]:
        """Return the ``(features, one_hot_target)`` pair at position ``idx``.

        ``idx`` is positional within the selected split (``iloc``), not a
        label of the underlying dataframe index.

        Raises
        ------
        IndexError
            If ``idx`` is outside the split.
        ValueError
            If the first target column holds a value that is not one of the
            species names.
        """
        # Fetch the row once and reuse it for both features and target.
        row = self.split.iloc[idx]

        # A row taken across string and numeric columns has object dtype, so
        # cast the selected features to float explicitly before building the
        # tensor instead of handing torch an object array.
        feats = tensor(row[self.input_keys].to_numpy(dtype=float), dtype=float32)

        # One of the species names, e.g. 'Adelie', 'Chinstrap' or 'Gentoo'.
        label = row[self.target_keys].values[0]

        # Row ``i`` of the identity matrix is the one-hot vector for class ``i``.
        tgts = eye(len(self.target_names))[self.target_names.index(label)]

        return (feats, tgts)