{ "cells": [ { "cell_type": "markdown", "id": "4e9aefe7", "metadata": {}, "source": [ "## The input data validation stage" ] }, { "cell_type": "markdown", "id": "657d13ab", "metadata": {}, "source": [ "In this section, the input validation steps are presented through an example." ] }, { "cell_type": "code", "execution_count": 1, "id": "274afaef", "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "id": "bfda4368", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "from eensight.config import ConfigLoader\n", "from eensight.methods.preprocessing.validation import (\n", " check_column_exists, \n", " check_column_type_datetime, \n", " check_column_values_increasing,\n", " check_column_values_unique,\n", " remove_duplicate_dates,\n", " validate_dataset,\n", ")\n", "from eensight.settings import PROJECT_PATH\n", "from eensight.utils import load_catalog" ] }, { "cell_type": "markdown", "id": "28b01073", "metadata": {}, "source": [ "### Load dataset\n", "\n", "First, we load the catalog for one of the available datasets (the one with `site_id=\"b03\"`):" ] }, { "cell_type": "code", "execution_count": 3, "id": "3ec999db", "metadata": {}, "outputs": [], "source": [ "catalog = load_catalog(store_uri=\"../../../data\", site_id=\"b03\", namespace=\"train\")" ] }, { "cell_type": "markdown", "id": "78259b1a", "metadata": {}, "source": [ "Get the raw input data:" ] }, { "cell_type": "code", "execution_count": 4, "id": "6a4911f4", "metadata": {}, "outputs": [], "source": [ "features = catalog.load(\"train.input-features\")\n", "labels = catalog.load(\"train.input-labels\")" ] }, { "cell_type": "markdown", "id": "ab33d284", "metadata": {}, "source": [ "### Validate labels\n", "\n", "Let's work first with the label data:" ] }, { "cell_type": "code", "execution_count": 5, "id": "c97e89f5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<class 'dict'>\n",
       "
\n" ], "text/plain": [ "\u001b[1m<\u001b[0m\u001b[1;95mclass\u001b[0m\u001b[39m \u001b[0m\u001b[32m'dict'\u001b[0m\u001b[1m>\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "type(labels)" ] }, { "cell_type": "markdown", "id": "c77215cb", "metadata": {}, "source": [ "Raw input data is always treated as a [partitioned dataset](https://kedro.readthedocs.io/en/stable/data/kedro_io.html#partitioned-dataset). This means that both features and labels are loaded by `eensight` as dictionaries with file names as keys and load functions as values." ] }, { "cell_type": "code", "execution_count": 6, "id": "99f0a6ef", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0timestampconsumption
002015-11-18 18:45:004671.259736
112015-11-18 19:00:004211.127331
222015-11-18 19:15:004393.182131
332015-11-18 19:30:004562.470893
442015-11-18 19:45:004535.828727
\n", "
" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "consumption = []\n", "\n", "for load_fn in labels.values():\n", " consumption.append(load_fn())\n", "\n", "consumption = pd.concat(consumption, axis=0)\n", "consumption.head()" ] }, { "cell_type": "markdown", "id": "5b96be0e", "metadata": {}, "source": [ "**Check if a column with the name `timestamp` exists**" ] }, { "cell_type": "code", "execution_count": 7, "id": "8931c57f", "metadata": {}, "outputs": [], "source": [ "assert check_column_exists(consumption, \"timestamp\").success" ] }, { "cell_type": "markdown", "id": "35528048", "metadata": {}, "source": [ "**Parse the contents of the `timestamp` column as dates**" ] }, { "cell_type": "code", "execution_count": 8, "id": "09be234d", "metadata": {}, "outputs": [], "source": [ "date_format = \"%Y-%m-%d %H:%M:%S\"\n", "\n", "if not check_column_type_datetime(consumption, \"timestamp\").success:\n", " try:\n", " consumption[\"timestamp\"] = pd.to_datetime(consumption[\"timestamp\"], format=date_format)\n", " consumption = consumption.dropna(subset=\"timestamp\")\n", " except ValueError:\n", " raise ValueError(f\"Column `timestamp` must be in datetime format\")" ] }, { "cell_type": "markdown", "id": "73d46ec3", "metadata": {}, "source": [ "**Sort the the contents of the `timestamp` column if they are not already in an increasing order**" ] }, { "cell_type": "code", "execution_count": 9, "id": "163671eb", "metadata": {}, "outputs": [], "source": [ "if not check_column_values_increasing(consumption, \"timestamp\").success:\n", " consumption = consumption.sort_values(by=[\"timestamp\"])" ] }, { "cell_type": "markdown", "id": "4d202f0b", "metadata": {}, "source": [ "**Check that the values of the `timestamp` column are unique, and if they are not, remove duplicate dates**\n", "\n", "We can test this functionality by adding some duplicate rows to the data. 
Half of these rows correspond to consumption values that differ by more than 0.25 (the default value of `threshold`) times the standard deviation of the data. These should be replaced by `NaN` at the end of this task." ] }, { "cell_type": "code", "execution_count": 10, "id": "30d5f5a1", "metadata": {}, "outputs": [], "source": [ "n_duplicate = 100\n", "n_out_of_range = 50\n", "nan_before = consumption[\"consumption\"].isna()\n", "\n", "consumption_with_dup = pd.concat(\n", " (consumption, consumption[~nan_before].sample(n=n_duplicate, replace=False)),\n", " axis=0,\n", " ignore_index=True,\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "id": "68596867", "metadata": {}, "outputs": [], "source": [ "assert not check_column_values_unique(consumption_with_dup, \"timestamp\").success" ] }, { "cell_type": "code", "execution_count": 12, "id": "946b9e3b", "metadata": {}, "outputs": [], "source": [ "data_std = consumption_with_dup[\"consumption\"].std()\n", "\n", "for i, (_, grouped) in enumerate(\n", " consumption_with_dup[consumption_with_dup.duplicated(subset=\"timestamp\")].groupby(\n", " \"timestamp\"\n", " )\n", "):\n", " if i < n_out_of_range:\n", " consumption_with_dup.loc[grouped.index[0], \"consumption\"] = (\n", " grouped[\"consumption\"].iloc[0] + 2 * data_std\n", " )" ] }, { "cell_type": "code", "execution_count": 13, "id": "d0ff10c2", "metadata": {}, "outputs": [], "source": [ "consumption_no_dup = remove_duplicate_dates(consumption_with_dup, \"timestamp\", threshold=0.25)\n", "\n", "assert check_column_values_unique(consumption_no_dup, \"timestamp\").success\n", "assert consumption_no_dup[\"consumption\"].isna().sum() == n_out_of_range + nan_before.sum()" ] }, { "cell_type": "markdown", "id": "73da0349", "metadata": {}, "source": [ "Finally, the `timestamp` column becomes the dataframe's index, and columns whose name contains \"Unnamed\" are dropped:" ] }, { "cell_type": "code", "execution_count": 14, "id": "4af5595e", "metadata": {}, "outputs": [], 
"source": [ "consumption = consumption.set_index(\"timestamp\")\n", "\n", "to_drop = consumption.filter(like=\"Unnamed\", axis=1).columns\n", "if len(to_drop) > 0:\n", " consumption = consumption.drop(to_drop, axis=1)" ] }, { "cell_type": "markdown", "id": "1d1c2be5", "metadata": {}, "source": [ "All, the aforementioned tasks are carried out by the `eensight.methods.preprocessing.validation.validate_dataset` function." ] }, { "cell_type": "markdown", "id": "7e5be1ab", "metadata": {}, "source": [ "### Validate features\n", "\n", "The features of this dataset include two files:" ] }, { "cell_type": "code", "execution_count": 15, "id": "5567a58b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
dict_keys(['holidays.csv', 'temperature.csv'])\n",
       "
\n" ], "text/plain": [ "\u001b[1;35mdict_keys\u001b[0m\u001b[1m(\u001b[0m\u001b[1m[\u001b[0m\u001b[32m'holidays.csv'\u001b[0m, \u001b[32m'temperature.csv'\u001b[0m\u001b[1m]\u001b[0m\u001b[1m)\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "features.keys()" ] }, { "cell_type": "markdown", "id": "63b15730", "metadata": {}, "source": [ "Each file is separately validated:" ] }, { "cell_type": "code", "execution_count": 16, "id": "1efcf8ab", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
holiday
timestamp
2015-01-01New year
2015-01-06Epiphany
2015-04-06Easter Monday
2015-04-25Liberation Day
2015-05-01International Workers' Day
\n", "
" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "load_fn = features[\"holidays.csv\"]\n", "holidays = load_fn()\n", "holidays = validate_dataset(holidays)\n", "holidays.head()" ] }, { "cell_type": "code", "execution_count": 17, "id": "bf73bb05", "metadata": {}, "outputs": [], "source": [ "assert check_column_type_datetime(holidays.reset_index(), \"timestamp\").success\n", "assert check_column_values_increasing(holidays.reset_index(), \"timestamp\").success\n", "assert check_column_values_unique(holidays.reset_index(), \"timestamp\").success" ] }, { "cell_type": "code", "execution_count": 18, "id": "632a14a0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
temperature
timestamp
2015-12-07 12:00:0014.3
2015-12-07 13:00:0015.2
2015-12-07 14:00:0016.0
2015-12-07 15:00:0016.2
2015-12-07 16:00:0015.8
\n", "
" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "load_fn = features[\"temperature.csv\"]\n", "temperature = load_fn()\n", "temperature = validate_dataset(temperature)\n", "temperature.head()" ] }, { "cell_type": "code", "execution_count": 19, "id": "2f99b35b", "metadata": {}, "outputs": [], "source": [ "assert check_column_type_datetime(temperature.reset_index(), \"timestamp\").success\n", "assert check_column_values_increasing(temperature.reset_index(), \"timestamp\").success\n", "assert check_column_values_unique(temperature.reset_index(), \"timestamp\").success" ] }, { "cell_type": "markdown", "id": "f2908fc0", "metadata": {}, "source": [ "### Parameters" ] }, { "cell_type": "markdown", "id": "e700128e", "metadata": {}, "source": [ "The parameters of the input data validation stage - as they can be found in the `eensight/conf/base/parameters/preprocess.yml` file - are:" ] }, { "cell_type": "code", "execution_count": 20, "id": "aa47298e", "metadata": {}, "outputs": [], "source": [ "params = ConfigLoader(PROJECT_PATH / \"conf\").get(\"parameters*\", \"parameters*/**\", \"**/parameters*\")" ] }, { "cell_type": "code", "execution_count": 21, "id": "26e04a52", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n",
       "{\n",
       "    'rebind_names': {'consumption': None, 'temperature': None, 'timestamp': None},\n",
       "    'date_format': '%Y-%m-%d %H:%M:%S',\n",
       "    'validation': {'threshold': 0.25}\n",
       "}\n",
       "
\n" ], "text/plain": [ "\n", "\u001b[1m{\u001b[0m\n", " \u001b[32m'rebind_names'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'consumption'\u001b[0m: \u001b[3;35mNone\u001b[0m, \u001b[32m'temperature'\u001b[0m: \u001b[3;35mNone\u001b[0m, \u001b[32m'timestamp'\u001b[0m: \u001b[3;35mNone\u001b[0m\u001b[1m}\u001b[0m,\n", " \u001b[32m'date_format'\u001b[0m: \u001b[32m'%Y-%m-%d %H:%M:%S'\u001b[0m,\n", " \u001b[32m'validation'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'threshold'\u001b[0m: \u001b[1;36m0.25\u001b[0m\u001b[1m}\u001b[0m\n", "\u001b[1m}\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "{\n", " \"rebind_names\": params[\"rebind_names\"],\n", " \"date_format\": params[\"date_format\"],\n", " \"validation\": params[\"validation\"],\n", "}" ] }, { "cell_type": "markdown", "id": "9f92ef69", "metadata": {}, "source": [ "-----------------" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 5 }