Tohoku Dataset
################

The second dataset used in this study is contributed by students at Tohoku
University.


Minimal Example
=================

.. code-block:: python

   # -------------------------------------------------------------------------
   # STEP-1: LOAD LIBRARIES
   # Base libraries
   import os
   from pathlib import Path

   # Database library
   import duckdb

   # Custom osbad library for anomaly detection
   import osbad.config as bconf
   from osbad.database import BenchDB

   # -------------------------------------------------------------------------
   # STEP-2: LOAD CELL INVENTORY FOR CELL_INDEX

   # Path to the DuckDB instance:
   # "train_dataset_severson.db"
   db_filepath = (
      Path.cwd()
      .parent
      .joinpath("database","train_dataset_severson.db"))
   print(db_filepath)

   # Create a DuckDB connection
   con = duckdb.connect(
      db_filepath,
      read_only=True)

   # Load all training dataset from duckdb
   df_duckdb = con.execute(
      "SELECT * FROM df_train_dataset_sv").fetchdf()

   unique_cell_index_train = df_duckdb["cell_index"].unique()
   print(unique_cell_index_train)

   # Get the cell-ID from unique_cell_index_train
   selected_cell_label = "2017-05-12_5_4C-70per_3C_CH17"

   # Create a subfolder to store fig output
   # corresponding to each cell-index
   selected_cell_artifacts_dir = bconf.artifacts_output_dir(
      selected_cell_label)

   # -------------------------------------------------------------------------
   # STEP-3: LOAD BENCHMARKING DATASET

   # Import the BenchDB class
   # Load only the dataset based on the selected cell
   benchdb = BenchDB(
      db_filepath,
      selected_cell_label)

   # load the benchmarking dataset
   df_selected_cell = benchdb.load_benchmark_dataset(
      dataset_type="train")

   if df_selected_cell is not None:

      filter_col = [
         "cell_index",
         "cycle_index",
         "discharge_capacity",
         "voltage"]

      # Drop true labels from the benchmarking dataset
      # and filter for selected columns only
      df_selected_cell_without_labels = benchdb.drop_labels(
         df_selected_cell,
         filter_col)

      # Extract true outliers cycle index from benchmarking dataset
      true_outlier_cycle_index = benchdb.get_true_outlier_cycle_index(
         df_selected_cell)
      print(f"True outlier cycle index:")
      print(true_outlier_cycle_index)

   # -------------------------------------------------------------------------
   # STEP-4: PLOT CYCLING DATASET

   # Plot cell data with true anomalies
   # If the true outlier cycle index is not known,
   # cycling data will be plotted without labels
   benchdb.plot_cycle_data(
      df_selected_cell_without_labels,
      true_outlier_cycle_index)

.. image:: docs_figure/cell_cycle_2017-05-12_5_4C-70per_3C_CH17.png
   :height: 450px
   :width: 650 px
   :alt: cell cycling dataset from ``2017-05-12_5_4C-70per_3C_CH17``
   :align: center