Skip to content

Data API Reference

The data module provides utilities for discovering experimental files, resolving symbolic paths for the LG M50T dataset, and performing unit conversions (e.g., mAh to Ah, °C to K). It also implements various data splitting strategies for cross-validation and holdout testing.

expt_paths

Path resolution for experiment data following Dataset.md conventions.

Classes

ExperimentPaths dataclass

ExperimentPaths(experiment_id: int, base_path: Path = Path('Raw Data'))

Resolves all paths for a given experiment following Dataset.md conventions.

Handles differences between experiments: Expt 5 uses "Cell A" while Expt 1-4 use "cell A", and the experiments follow different folder naming conventions.

Example usage

paths = ExperimentPaths(5, Path("Raw Data"))
summary_path = paths.performance_summary("A", 10)
print(summary_path)

Functions
exists
exists() -> bool

Check if the experiment directory exists.

Returns:

Type Description
bool

True if directory exists

Source code in src/data/expt_paths.py
def exists(self) -> bool:
    """Report whether the experiment's root directory is present on disk.

    Returns:
        True when the directory exists, False otherwise.
    """
    directory_present: bool = self.expt_path.exists()
    return directory_present
list_all_files
list_all_files(pattern: str = '*.csv') -> List[Path]

List all files matching pattern in experiment directory.

Parameters:

Name Type Description Default
pattern str

Glob pattern (default: "*.csv")

'*.csv'

Returns:

Type Description
List[Path]

List of matching file paths

Source code in src/data/expt_paths.py
def list_all_files(self, pattern: str = "*.csv") -> List[Path]:
    """Collect every file under the experiment directory matching a glob.

    Args:
        pattern: Glob pattern applied recursively (default: "*.csv")

    Returns:
        Matching file paths; empty when the experiment directory is absent
    """
    root = self.expt_path
    if root.exists():
        return list(root.rglob(pattern))
    return []
list_available_rpts
list_available_rpts(cell_id: str, curve_type: str = '0.1C') -> List[int]

List all available RPT indices for a cell.

Parameters:

Name Type Description Default
cell_id str

Cell identifier

required
curve_type str

Type of curve

'0.1C'

Returns:

Type Description
List[int]

Sorted list of available RPT indices

Source code in src/data/expt_paths.py
def list_available_rpts(self, cell_id: str, curve_type: str = "0.1C") -> List[int]:
    """List all available RPT indices for a cell.

    Scans the processed-timeseries folder for discharge curve files and
    extracts the numeric RPT index embedded in each filename.

    Args:
        cell_id: Cell identifier
        curve_type: Type of curve

    Returns:
        Sorted list of available RPT indices (empty if the folder is missing)
    """
    cell_dir = f"{self.cell_prefix} {cell_id}"
    curve_dir = (self.expt_path / "Processed Timeseries Data" / 
                 f"{curve_type} Voltage Curves" / cell_dir)

    if not curve_dir.exists():
        return []

    rpts = []
    # Plain string literal: nothing is interpolated here (the previous
    # version carried a needless f-string prefix).
    for f in curve_dir.glob("*RPT*discharge*.csv"):
        match = re.search(r'RPT(\d+)', f.name)
        if match:
            rpts.append(int(match.group(1)))
    return sorted(rpts)
performance_summary
performance_summary(cell_id: str, temp_C: int) -> Path

Performance Summary CSV (set-level health indicators).

Parameters:

Name Type Description Default
cell_id str

Cell identifier ('A', 'B', ..., 'H')

required
temp_C int

Temperature in Celsius

required

Returns:

Type Description
Path

Path to the Performance Summary CSV file

Source code in src/data/expt_paths.py
def performance_summary(self, cell_id: str, temp_C: int) -> Path:
    """Build the path to the Performance Summary CSV (set-level health indicators).

    Args:
        cell_id: Cell identifier ('A', 'B', ..., 'H')
        temp_C: Temperature in Celsius

    Returns:
        Path to the Performance Summary CSV file
    """
    # Experiment 2's files are labelled "2,2" rather than plain "2".
    if self.experiment_id == 2:
        exp_label = "2,2"
    else:
        exp_label = str(self.experiment_id)
    name = (
        f"Expt {exp_label} - cell {cell_id} "
        f"({temp_C}degC) - Processed Data.csv"
    )
    return self.expt_path / "Summary Data" / "Performance Summary" / name
summary_per_cycle
summary_per_cycle(cell_id: str) -> Path

Summary per Cycle CSV (cycle-level metrics).

Parameters:

Name Type Description Default
cell_id str

Cell identifier

required

Returns:

Type Description
Path

Path to the Summary per Cycle CSV file

Source code in src/data/expt_paths.py
def summary_per_cycle(self, cell_id: str) -> Path:
    """Build the path to the Summary per Cycle CSV (cycle-level metrics).

    Args:
        cell_id: Cell identifier

    Returns:
        Path to the Summary per Cycle CSV file
    """
    # Experiment 2's files are labelled "2,2" rather than plain "2".
    exp_label = str(self.experiment_id) if self.experiment_id != 2 else "2,2"
    target = self.expt_path
    for folder in ("Summary Data", "Ageing Sets Summary", "Summary per Cycle"):
        target = target / folder
    return target / f"expt {exp_label} - cell {cell_id} - cycle_data.csv"
summary_per_set
summary_per_set(cell_id: str) -> Path

Summary per Set CSV.

Parameters:

Name Type Description Default
cell_id str

Cell identifier

required

Returns:

Type Description
Path

Path to the Summary per Set CSV file

Source code in src/data/expt_paths.py
def summary_per_set(self, cell_id: str) -> Path:
    """Build the path to the Summary per Set CSV.

    Args:
        cell_id: Cell identifier

    Returns:
        Path to the Summary per Set CSV file
    """
    # Experiment 2's files are labelled "2,2" rather than plain "2".
    exp_label = "2,2" if self.experiment_id == 2 else f"{self.experiment_id}"
    filename = " - ".join(
        [f"expt {exp_label}", f"cell {cell_id}", "set_data.csv"]
    )
    return (self.expt_path / "Summary Data" / "Ageing Sets Summary"
            / "Summary per Set" / filename)
voltage_curve
voltage_curve(cell_id: str, rpt: int, curve_type: str = '0.1C', direction: str = 'discharge') -> Path

Processed voltage curve CSV.

Parameters:

Name Type Description Default
cell_id str

Cell identifier

required
rpt int

RPT measurement index

required
curve_type str

Type of curve (e.g., "0.1C")

'0.1C'
direction str

"discharge" or "charge"

'discharge'

Returns:

Type Description
Path

Path to the voltage curve CSV file

Source code in src/data/expt_paths.py
def voltage_curve(self, cell_id: str, rpt: int, 
                  curve_type: str = "0.1C",
                  direction: str = "discharge") -> Path:
    """Build the path to a processed voltage curve CSV.

    Args:
        cell_id: Cell identifier
        rpt: RPT measurement index
        curve_type: Type of curve (e.g., "0.1C")
        direction: "discharge" or "charge"

    Returns:
        Path to the voltage curve CSV file
    """
    # Experiment 2's files are labelled "2,2" rather than plain "2".
    exp_label = "2,2" if self.experiment_id == 2 else str(self.experiment_id)
    folder = (self.expt_path
              / "Processed Timeseries Data"
              / f"{curve_type} Voltage Curves"
              / f"{self.cell_prefix} {cell_id}")
    return folder / (
        f"Expt {exp_label} - cell {cell_id} - RPT{rpt} - "
        f"{curve_type} {direction} data.csv"
    )

tables

Data loaders for summary CSV files.

Classes

SummaryDataLoader

SummaryDataLoader(experiment_id: int, base_path: Path)

Load and normalize summary CSV data.

Example usage

loader = SummaryDataLoader(5, Path("Raw Data"))
df = loader.load_all_cells(
...     cells=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'],
...     temp_map={10: ['A', 'B', 'C'], 25: ['D', 'E'], 40: ['F', 'G', 'H']}
... )

Initialize the loader.

Parameters:

Name Type Description Default
experiment_id int

Experiment ID (1-5)

required
base_path Path

Base path to raw data

required
Source code in src/data/tables.py
def __init__(self, experiment_id: int, base_path: Path):
    """Initialize the loader.

    Args:
        experiment_id: Experiment ID (1-5)
        base_path: Base path to raw data
    """
    # Path(...) accepts either a Path or a string-like base_path.
    self.experiment_id = experiment_id
    self.paths = ExperimentPaths(experiment_id, Path(base_path))
Functions
get_available_cells
get_available_cells() -> List[str]

Get list of cells with available data.

Returns:

Type Description
List[str]

List of cell IDs that have data files

Source code in src/data/tables.py
def get_available_cells(self) -> List[str]:
    """Get list of cells with available data.

    A cell counts as available when a performance-summary file exists
    for at least one of the commonly used temperatures.

    Returns:
        List of cell IDs that have data files
    """
    common_temps = (10, 25, 40)
    return [
        cell_id
        for cell_id in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
        if any(
            self.paths.performance_summary(cell_id, temp_C).exists()
            for temp_C in common_temps
        )
    ]
load_all_cells
load_all_cells(cells: List[str], temp_map: Dict[int, List[str]]) -> pd.DataFrame

Load performance summary for all specified cells.

Parameters:

Name Type Description Default
cells List[str]

List of cell IDs to load

required
temp_map Dict[int, List[str]]

Mapping from temperature (°C) to cell IDs

required

Returns:

Type Description
DataFrame

Combined DataFrame with all cells

Source code in src/data/tables.py
def load_all_cells(self, cells: List[str], 
                   temp_map: Dict[int, List[str]]) -> pd.DataFrame:
    """Load performance summaries for every requested cell.

    Args:
        cells: List of cell IDs to load
        temp_map: Mapping from temperature (°C) to cell IDs

    Returns:
        Combined DataFrame with all cells

    Raises:
        ValueError: If no cell could be loaded at all
    """
    frames = []

    for temp_C, temp_cells in temp_map.items():
        for cell_id in temp_cells:
            if cell_id not in cells:
                continue
            try:
                frame = self.load_performance_summary(cell_id, temp_C)
            except FileNotFoundError as e:
                # Missing files are tolerated; the cell is simply skipped.
                logger.warning(f"Skipping cell {cell_id}: {e}")
                continue
            frames.append(frame)
            logger.info(f"Loaded cell {cell_id} at {temp_C}°C: {len(frame)} samples")

    if not frames:
        raise ValueError("No data loaded. Check paths and cell IDs.")

    combined = pd.concat(frames, ignore_index=True)
    logger.info(f"Loaded {len(combined)} total samples from {len(frames)} cells")

    return combined
load_performance_summary
load_performance_summary(cell_id: str, temp_C: int) -> pd.DataFrame

Load Performance Summary with unit normalization.

Parameters:

Name Type Description Default
cell_id str

Cell identifier ('A', 'B', ..., 'H')

required
temp_C int

Temperature in Celsius

required

Returns:

Type Description
DataFrame

DataFrame with normalized units and metadata columns

Raises:

Type Description
FileNotFoundError

If the summary file doesn't exist

Source code in src/data/tables.py
def load_performance_summary(self, cell_id: str, temp_C: int) -> pd.DataFrame:
    """Load Performance Summary with unit normalization.

    Args:
        cell_id: Cell identifier ('A', 'B', ..., 'H')
        temp_C: Temperature in Celsius

    Returns:
        DataFrame with normalized units and metadata columns

    Raises:
        FileNotFoundError: If the summary file doesn't exist
    """
    path = self.paths.performance_summary(cell_id, temp_C)

    if not path.exists():
        raise FileNotFoundError(f"Performance summary not found: {path}")

    logger.debug(f"Loading performance summary: {path}")

    # The CSV's first column is the row index; capacity-like columns
    # are normalized to Ah straight after reading.
    raw = pd.read_csv(path, index_col=0)
    frame = UnitConverter.normalize_all_capacity_columns(raw)

    # Attach provenance metadata for downstream grouping.
    for name, value in (('cell_id', cell_id),
                        ('temperature_C', temp_C),
                        ('experiment_id', self.experiment_id)):
        frame[name] = value

    return frame
load_summary_per_cycle
load_summary_per_cycle(cell_id: str) -> pd.DataFrame

Load cycle-level summary with unit normalization.

Parameters:

Name Type Description Default
cell_id str

Cell identifier

required

Returns:

Type Description
DataFrame

DataFrame with cycle-level metrics

Raises:

Type Description
FileNotFoundError

If the summary file doesn't exist

Source code in src/data/tables.py
def load_summary_per_cycle(self, cell_id: str) -> pd.DataFrame:
    """Load cycle-level summary with unit normalization.

    Args:
        cell_id: Cell identifier

    Returns:
        DataFrame with cycle-level metrics

    Raises:
        FileNotFoundError: If the summary file doesn't exist
    """
    path = self.paths.summary_per_cycle(cell_id)

    if not path.exists():
        raise FileNotFoundError(f"Cycle summary not found: {path}")

    logger.debug(f"Loading cycle summary: {path}")

    # Capacity-like columns are normalized to Ah straight after reading.
    frame = UnitConverter.normalize_all_capacity_columns(pd.read_csv(path))

    # Attach provenance metadata for downstream grouping.
    frame['cell_id'] = cell_id
    frame['experiment_id'] = self.experiment_id

    return frame
load_summary_per_set
load_summary_per_set(cell_id: str) -> pd.DataFrame

Load set-level summary with unit normalization.

Parameters:

Name Type Description Default
cell_id str

Cell identifier

required

Returns:

Type Description
DataFrame

DataFrame with set-level metrics

Raises:

Type Description
FileNotFoundError

If the summary file doesn't exist

Source code in src/data/tables.py
def load_summary_per_set(self, cell_id: str) -> pd.DataFrame:
    """Load set-level summary with unit normalization.

    Args:
        cell_id: Cell identifier

    Returns:
        DataFrame with set-level metrics

    Raises:
        FileNotFoundError: If the summary file doesn't exist
    """
    path = self.paths.summary_per_set(cell_id)

    if not path.exists():
        raise FileNotFoundError(f"Set summary not found: {path}")

    logger.debug(f"Loading set summary: {path}")

    # Capacity-like columns are normalized to Ah straight after reading.
    frame = UnitConverter.normalize_all_capacity_columns(pd.read_csv(path))

    # Attach provenance metadata for downstream grouping.
    frame['cell_id'] = cell_id
    frame['experiment_id'] = self.experiment_id

    return frame

TimeseriesDataLoader

TimeseriesDataLoader(experiment_id: int, base_path: Path)

Load voltage curve timeseries data.

Initialize the loader.

Parameters:

Name Type Description Default
experiment_id int

Experiment ID (1-5)

required
base_path Path

Base path to raw data

required
Source code in src/data/tables.py
def __init__(self, experiment_id: int, base_path: Path):
    """Initialize the loader.

    Args:
        experiment_id: Experiment ID (1-5)
        base_path: Base path to raw data
    """
    # Path(...) accepts either a Path or a string-like base_path.
    resolved_base = Path(base_path)
    self.paths = ExperimentPaths(experiment_id, resolved_base)
    self.experiment_id = experiment_id
Functions
load_all_curves
load_all_curves(cell_id: str, curve_type: str = '0.1C', direction: str = 'discharge') -> Dict[int, pd.DataFrame]

Load all available voltage curves for a cell.

Parameters:

Name Type Description Default
cell_id str

Cell identifier

required
curve_type str

Type of curve

'0.1C'
direction str

"discharge" or "charge"

'discharge'

Returns:

Type Description
Dict[int, DataFrame]

Dictionary mapping RPT index to DataFrame

Source code in src/data/tables.py
def load_all_curves(self, cell_id: str, 
                    curve_type: str = "0.1C",
                    direction: str = "discharge") -> Dict[int, pd.DataFrame]:
    """Load all available voltage curves for a cell.

    Args:
        cell_id: Cell identifier
        curve_type: Type of curve
        direction: "discharge" or "charge"

    Returns:
        Dictionary mapping RPT index to DataFrame
    """
    curves: Dict[int, pd.DataFrame] = {}

    # Individual load failures are tolerated; the RPT is simply skipped.
    for rpt in self.paths.list_available_rpts(cell_id, curve_type):
        try:
            curves[rpt] = self.load_voltage_curve(cell_id, rpt, curve_type, direction)
        except FileNotFoundError:
            logger.warning(f"Could not load RPT {rpt} for cell {cell_id}")

    logger.info(f"Loaded {len(curves)} curves for cell {cell_id}")
    return curves
load_voltage_curve
load_voltage_curve(cell_id: str, rpt: int, curve_type: str = '0.1C', direction: str = 'discharge') -> pd.DataFrame

Load a single voltage curve.

Parameters:

Name Type Description Default
cell_id str

Cell identifier

required
rpt int

RPT measurement index

required
curve_type str

Type of curve (e.g., "0.1C")

'0.1C'
direction str

"discharge" or "charge"

'discharge'

Returns:

Type Description
DataFrame

DataFrame with voltage curve data

Raises:

Type Description
FileNotFoundError

If the curve file doesn't exist

Source code in src/data/tables.py
def load_voltage_curve(self, cell_id: str, rpt: int,
                       curve_type: str = "0.1C",
                       direction: str = "discharge") -> pd.DataFrame:
    """Load a single voltage curve.

    Args:
        cell_id: Cell identifier
        rpt: RPT measurement index
        curve_type: Type of curve (e.g., "0.1C")
        direction: "discharge" or "charge"

    Returns:
        DataFrame with voltage curve data

    Raises:
        FileNotFoundError: If the curve file doesn't exist
    """
    path = self.paths.voltage_curve(cell_id, rpt, curve_type, direction)

    if not path.exists():
        raise FileNotFoundError(f"Voltage curve not found: {path}")

    logger.debug(f"Loading voltage curve: {path}")

    # Typical columns: Voltage [V], Capacity [mA h], ... — capacity-like
    # columns are normalized to Ah straight after reading.
    frame = UnitConverter.normalize_all_capacity_columns(pd.read_csv(path))

    # Attach provenance metadata for downstream grouping.
    for name, value in (('cell_id', cell_id),
                        ('rpt_id', rpt),
                        ('experiment_id', self.experiment_id)):
        frame[name] = value

    return frame

splits

Data split strategies for battery degradation experiments.

Classes

Functions

leave_one_cell_out

leave_one_cell_out(samples: List[Sample], test_cell: str) -> Tuple[List[Sample], List[Sample]]

Leave-one-cell-out split.

Use for testing generalization to unseen cells.

Parameters:

Name Type Description Default
samples List[Sample]

List of Sample objects

required
test_cell str

Cell ID to hold out for testing

required

Returns:

Type Description
Tuple[List[Sample], List[Sample]]

Tuple of (train_samples, test_samples)

Example

train, test = leave_one_cell_out(samples, test_cell='A')

Source code in src/data/splits.py
def leave_one_cell_out(samples: List[Sample], 
                       test_cell: str) -> Tuple[List[Sample], List[Sample]]:
    """Leave-one-cell-out split.

    Use for testing generalization to unseen cells.

    Args:
        samples: List of Sample objects
        test_cell: Cell ID to hold out for testing

    Returns:
        Tuple of (train_samples, test_samples)

    Example:
        >>> train, test = leave_one_cell_out(samples, test_cell='A')
    """
    train: List[Sample] = []
    test: List[Sample] = []
    for sample in samples:
        bucket = test if sample.meta.get('cell_id') == test_cell else train
        bucket.append(sample)

    return train, test

loco_cv_splits

loco_cv_splits(samples: List[Sample]) -> List[Tuple[str, List[Sample], List[Sample]]]

Generate all leave-one-cell-out cross-validation splits.

Parameters:

Name Type Description Default
samples List[Sample]

List of Sample objects

required

Returns:

Type Description
List[Tuple[str, List[Sample], List[Sample]]]

List of (cell_id, train_samples, test_samples) tuples

Example

for cell_id, train, test in loco_cv_splits(samples):
...     model.fit(train)
...     score = model.evaluate(test)

Source code in src/data/splits.py
def loco_cv_splits(samples: List[Sample]) -> List[Tuple[str, List[Sample], List[Sample]]]:
    """Generate all leave-one-cell-out cross-validation splits.

    Args:
        samples: List of Sample objects

    Returns:
        List of (cell_id, train_samples, test_samples) tuples

    Example:
        >>> for cell_id, train, test in loco_cv_splits(samples):
        ...     model.fit(train)
        ...     score = model.evaluate(test)
    """
    # Samples without a 'cell_id' never form a fold of their own.
    cell_ids = sorted({s.meta.get('cell_id') for s in samples if 'cell_id' in s.meta})

    return [
        (cell,) + leave_one_cell_out(samples, cell)
        for cell in cell_ids
    ]

random_split

random_split(samples: List[Sample], train_fraction: float = 0.7, val_fraction: float = 0.15, seed: int = 42) -> Tuple[List[Sample], List[Sample], List[Sample]]

Random split with fixed seed.

Parameters:

Name Type Description Default
samples List[Sample]

List of Sample objects

required
train_fraction float

Fraction for training

0.7
val_fraction float

Fraction for validation

0.15
seed int

Random seed for reproducibility

42

Returns:

Type Description
Tuple[List[Sample], List[Sample], List[Sample]]

Tuple of (train_samples, val_samples, test_samples)

Source code in src/data/splits.py
def random_split(samples: List[Sample],
                 train_fraction: float = 0.7,
                 val_fraction: float = 0.15,
                 seed: int = 42) -> Tuple[List[Sample], List[Sample], List[Sample]]:
    """Random split with fixed seed.

    Args:
        samples: List of Sample objects
        train_fraction: Fraction for training
        val_fraction: Fraction for validation
        seed: Random seed for reproducibility

    Returns:
        Tuple of (train_samples, val_samples, test_samples)
    """
    import random

    # Shuffle a copy so the caller's list is left untouched.
    pool = list(samples)
    random.Random(seed).shuffle(pool)

    total = len(pool)
    cut_train = int(total * train_fraction)
    cut_val = int(total * (train_fraction + val_fraction))

    return pool[:cut_train], pool[cut_train:cut_val], pool[cut_val:]

stratified_temperature_split

stratified_temperature_split(samples: List[Sample], val_fraction: float = 0.2, seed: int = 42) -> Tuple[List[Sample], List[Sample]]

Stratified split maintaining temperature distribution.

Parameters:

Name Type Description Default
samples List[Sample]

List of Sample objects

required
val_fraction float

Fraction for validation

0.2
seed int

Random seed

42

Returns:

Type Description
Tuple[List[Sample], List[Sample]]

Tuple of (train_samples, val_samples)

Source code in src/data/splits.py
def stratified_temperature_split(samples: List[Sample],
                                  val_fraction: float = 0.2,
                                  seed: int = 42) -> Tuple[List[Sample], List[Sample]]:
    """Stratified split maintaining temperature distribution.

    Args:
        samples: List of Sample objects
        val_fraction: Fraction for validation
        seed: Random seed

    Returns:
        Tuple of (train_samples, val_samples)
    """
    import random

    rng = random.Random(seed)

    # Bucket samples by temperature (falling back to 25 °C when absent).
    buckets: Dict[int, List[Sample]] = {}
    for sample in samples:
        key = sample.meta.get('temperature_C', 25)
        buckets.setdefault(key, []).append(sample)

    train: List[Sample] = []
    val: List[Sample] = []

    for group in buckets.values():
        mixed = list(group)
        rng.shuffle(mixed)

        # Every temperature contributes at least one validation sample.
        n_val = max(1, int(len(mixed) * val_fraction))
        val += mixed[:n_val]
        train += mixed[n_val:]

    return train, val

temperature_split

temperature_split(samples: List[Sample], train_temps: List[int], val_temps: List[int]) -> Tuple[List[Sample], List[Sample]]

Split samples by temperature.

Default for Expt 5: train on [10, 40], val on [25]. Tests temperature interpolation capability.

Parameters:

Name Type Description Default
samples List[Sample]

List of Sample objects

required
train_temps List[int]

Temperatures for training (e.g., [10, 40])

required
val_temps List[int]

Temperatures for validation (e.g., [25])

required

Returns:

Type Description
Tuple[List[Sample], List[Sample]]

Tuple of (train_samples, val_samples)

Example

train, val = temperature_split(samples, train_temps=[10, 40], val_temps=[25])

Source code in src/data/splits.py
def temperature_split(samples: List[Sample],
                      train_temps: List[int],
                      val_temps: List[int]) -> Tuple[List[Sample], List[Sample]]:
    """Split samples by temperature.

    Default for Expt 5: train on [10, 40], val on [25].
    Tests temperature interpolation capability.

    Args:
        samples: List of Sample objects
        train_temps: Temperatures for training (e.g., [10, 40])
        val_temps: Temperatures for validation (e.g., [25])

    Returns:
        Tuple of (train_samples, val_samples)

    Example:
        >>> train, val = temperature_split(samples, train_temps=[10, 40], val_temps=[25])
    """
    train: List[Sample] = []
    val: List[Sample] = []
    # A sample may land in both lists if its temperature is in both sets.
    for sample in samples:
        temp = sample.meta.get('temperature_C')
        if temp in train_temps:
            train.append(sample)
        if temp in val_temps:
            val.append(sample)

    return train, val

temporal_split

temporal_split(samples: List[Sample], train_fraction: float = 0.7, val_fraction: float = 0.15) -> Tuple[List[Sample], List[Sample], List[Sample]]

Split samples temporally (early cycles for train, later for val/test).

Useful for testing extrapolation to future degradation states.

Parameters:

Name Type Description Default
samples List[Sample]

List of Sample objects (should have 'set_idx' or 'cycle_idx' in meta)

required
train_fraction float

Fraction of samples for training

0.7
val_fraction float

Fraction of samples for validation

0.15

Returns:

Type Description
Tuple[List[Sample], List[Sample], List[Sample]]

Tuple of (train_samples, val_samples, test_samples)

Source code in src/data/splits.py
def temporal_split(samples: List[Sample],
                   train_fraction: float = 0.7,
                   val_fraction: float = 0.15) -> Tuple[List[Sample], List[Sample], List[Sample]]:
    """Split samples temporally (early cycles for train, later for val/test).

    Useful for testing extrapolation to future degradation states.

    Args:
        samples: List of Sample objects (should have 'set_idx' or 'cycle_idx' in meta)
        train_fraction: Fraction of samples for training
        val_fraction: Fraction of samples for validation

    Returns:
        Tuple of (train_samples, val_samples, test_samples)
    """
    # Order by set index, falling back to cycle index, then zero.
    ordered = sorted(
        samples,
        key=lambda s: s.meta.get('set_idx', s.meta.get('cycle_idx', 0)),
    )

    total = len(ordered)
    cut_a = int(total * train_fraction)
    cut_b = int(total * (train_fraction + val_fraction))

    return ordered[:cut_a], ordered[cut_a:cut_b], ordered[cut_b:]

units

Centralized unit conversions for battery data.

Call these ONCE during data loading to ensure consistent internal units.

Classes

UnitConverter

Ensures all data uses consistent units internally.

Internal units (after conversion): Capacity in Ah (not mAh); Current in A (not mA); Temperature in K (for Arrhenius calculations); Time in seconds (or days for long-term analysis); Resistance in Ohms.

Example usage

capacity_mAh = 4800
capacity_Ah = UnitConverter.mAh_to_Ah(capacity_mAh)
print(capacity_Ah)  # 4.8

Functions
A_to_mA staticmethod
A_to_mA(value: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]

Convert amps to milliamps.

Parameters:

Name Type Description Default
value Union[float, ndarray, Series]

Value(s) in A

required

Returns:

Type Description
Union[float, ndarray, Series]

Value(s) in mA

Source code in src/data/units.py
@staticmethod
def A_to_mA(value: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """Convert amps to milliamps.

    Args:
        value: Value(s) in A

    Returns:
        Value(s) in mA
    """
    return value * 1000.0
Ah_to_mAh staticmethod
Ah_to_mAh(value: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]

Convert amp-hours to milliamp-hours.

Parameters:

Name Type Description Default
value Union[float, ndarray, Series]

Value(s) in Ah

required

Returns:

Type Description
Union[float, ndarray, Series]

Value(s) in mAh

Source code in src/data/units.py
@staticmethod
def Ah_to_mAh(value: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """Convert amp-hours to milliamp-hours.

    Args:
        value: Value(s) in Ah

    Returns:
        Value(s) in mAh
    """
    return value * 1000.0
celsius_to_kelvin staticmethod
celsius_to_kelvin(value: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]

Convert Celsius to Kelvin.

Parameters:

Name Type Description Default
value Union[float, ndarray, Series]

Temperature(s) in °C

required

Returns:

Type Description
Union[float, ndarray, Series]

Temperature(s) in K

Source code in src/data/units.py
@staticmethod
def celsius_to_kelvin(value: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """Convert Celsius to Kelvin.

    Args:
        value: Temperature(s) in °C

    Returns:
        Temperature(s) in K
    """
    return value + 273.15
compute_arrhenius_factor staticmethod
compute_arrhenius_factor(temp_K: Union[float, np.ndarray], Ea: float = 50000.0) -> Union[float, np.ndarray]

Compute Arrhenius factor exp(-Ea/RT).

Parameters:

Name Type Description Default
temp_K Union[float, ndarray]

Temperature in Kelvin

required
Ea float

Activation energy in J/mol (default: 50000)

50000.0

Returns:

Type Description
Union[float, ndarray]

Arrhenius factor

Source code in src/data/units.py
@staticmethod
def compute_arrhenius_factor(temp_K: Union[float, np.ndarray], 
                               Ea: float = 50000.0) -> Union[float, np.ndarray]:
    """Compute Arrhenius factor exp(-Ea/RT).

    Args:
        temp_K: Temperature in Kelvin
        Ea: Activation energy in J/mol (default: 50000)

    Returns:
        Arrhenius factor
    """
    R = 8.314  # J/(mol·K)
    return np.exp(-Ea / (R * temp_K))
kelvin_to_celsius staticmethod
kelvin_to_celsius(value: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]

Convert Kelvin to Celsius.

Parameters:

Name Type Description Default
value Union[float, ndarray, Series]

Temperature(s) in K

required

Returns:

Type Description
Union[float, ndarray, Series]

Temperature(s) in °C

Source code in src/data/units.py
@staticmethod
def kelvin_to_celsius(value: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """Convert Kelvin to Celsius.

    Args:
        value: Temperature(s) in K

    Returns:
        Temperature(s) in °C
    """
    return value - 273.15
mA_to_A staticmethod
mA_to_A(value: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]

Convert milliamps to amps.

Parameters:

Name Type Description Default
value Union[float, ndarray, Series]

Value(s) in mA

required

Returns:

Type Description
Union[float, ndarray, Series]

Value(s) in A

Source code in src/data/units.py
@staticmethod
def mA_to_A(value: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """Convert milliamps to amps.

    Args:
        value: Value(s) in mA

    Returns:
        Value(s) in A
    """
    return value / 1000.0
mAh_to_Ah staticmethod
mAh_to_Ah(value: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]

Convert milliamp-hours to amp-hours.

Parameters:

Name Type Description Default
value Union[float, ndarray, Series]

Value(s) in mAh

required

Returns:

Type Description
Union[float, ndarray, Series]

Value(s) in Ah

Source code in src/data/units.py
@staticmethod
def mAh_to_Ah(value: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """Convert milliamp-hours to amp-hours.

    Args:
        value: Value(s) in mAh

    Returns:
        Value(s) in Ah
    """
    return value / 1000.0
normalize_all_capacity_columns staticmethod
normalize_all_capacity_columns(df: pd.DataFrame) -> pd.DataFrame

Normalize all capacity-related columns to Ah.

Parameters:

Name Type Description Default
df DataFrame

DataFrame to normalize

required

Returns:

Type Description
DataFrame

DataFrame with all capacity columns normalized

Source code in src/data/units.py
@staticmethod
def normalize_all_capacity_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize all capacity-related columns to Ah.

    Any column whose name contains a capacity-related keyword is passed
    through the per-column mAh/Ah auto-detection.

    Args:
        df: DataFrame to normalize

    Returns:
        DataFrame with all capacity columns normalized
    """
    keywords = ('capacity', 'throughput', 'charge', 'discharge')

    result = df.copy()
    for col in result.columns:
        lowered = col.lower()
        if any(kw in lowered for kw in keywords):
            result = UnitConverter.normalize_capacity_column(result, col)

    return result
normalize_capacity_column staticmethod
normalize_capacity_column(df: pd.DataFrame, col: str) -> pd.DataFrame

Auto-detect mAh vs Ah and normalize to Ah.

Heuristic: if column name contains 'mA' or mean value > 100, assume mAh and convert.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing the column

required
col str

Column name to normalize

required

Returns:

Type Description
DataFrame

DataFrame with normalized column (modified copy)

Source code in src/data/units.py
@staticmethod
def normalize_capacity_column(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Auto-detect mAh vs Ah and normalize a column to Ah.

    Heuristic: if the column name carries an mAh marker ('[mA h]',
    '[mAh]' or 'mAh'), or the column's mean value exceeds 100, the
    values are assumed to be mAh and divided by 1000.

    Args:
        df: DataFrame containing the column
        col: Column name to normalize

    Returns:
        DataFrame with normalized column (modified copy). The copy is
        returned unchanged when the column is missing or non-numeric.
    """
    df = df.copy()

    # Skip absent or non-numeric columns. Previously a string column
    # (e.g. a 'Charge mode' column routed here by the keyword scan in
    # normalize_all_capacity_columns) crashed on .mean()/division.
    if col not in df.columns or not pd.api.types.is_numeric_dtype(df[col]):
        return df

    # Check column name and values
    name_flags_mAh = (
        '[mA h]' in col or
        '[mAh]' in col or
        'mAh' in col
    )

    if name_flags_mAh or df[col].mean() > 100:
        df[col] = df[col] / 1000.0

    return df

discovery

File discovery utilities for experiment data.

Classes

Functions

discover_experiment_files

discover_experiment_files(base_path: Path, experiment_id: int) -> Dict[str, List[Path]]

Discover all available data files for an experiment.

Parameters:

Name Type Description Default
base_path Path

Base path to data

required
experiment_id int

Experiment ID (1-5)

required

Returns:

Type Description
Dict[str, List[Path]]

Dictionary with keys:

Dict[str, List[Path]]
  • 'performance_summary': List of performance summary files
Dict[str, List[Path]]
  • 'cycle_summary': List of cycle summary files
Dict[str, List[Path]]
  • 'set_summary': List of set summary files
Dict[str, List[Path]]
  • 'voltage_curves': List of voltage curve files
Source code in src/data/discovery.py
def discover_experiment_files(base_path: Path, 
                               experiment_id: int) -> Dict[str, List[Path]]:
    """Discover all available data files for an experiment.

    Args:
        base_path: Base path to data
        experiment_id: Experiment ID (1-5)

    Returns:
        Dictionary with keys:
        - 'performance_summary': List of performance summary files
        - 'cycle_summary': List of cycle summary files
        - 'set_summary': List of set summary files
        - 'voltage_curves': List of voltage curve files
    """
    paths = ExperimentPaths(experiment_id, base_path)

    result: Dict[str, List[Path]] = {
        'performance_summary': [],
        'cycle_summary': [],
        'set_summary': [],
        'voltage_curves': [],
    }

    if not paths.exists():
        logger.warning(f"Experiment directory not found: {paths.expt_path}")
        return result

    root = paths.expt_path
    summary_root = root / "Summary Data"
    ageing_root = summary_root / "Ageing Sets Summary"

    # (directory, result key, search recursively?) for each category;
    # only the timeseries tree is nested, so only it uses rglob.
    searches = [
        (summary_root / "Performance Summary", 'performance_summary', False),
        (ageing_root / "Summary per Cycle", 'cycle_summary', False),
        (ageing_root / "Summary per Set", 'set_summary', False),
        (root / "Processed Timeseries Data", 'voltage_curves', True),
    ]
    for directory, key, recursive in searches:
        if directory.exists():
            finder = directory.rglob if recursive else directory.glob
            result[key] = list(finder("*.csv"))

    for key, files in result.items():
        logger.info(f"Found {len(files)} {key} files")

    return result

parse_filename_metadata

parse_filename_metadata(filename: str) -> Dict[str, Any]

Extract metadata from standardized filename.

Parameters:

Name Type Description Default
filename str

Filename to parse

required

Returns:

Type Description
Dict[str, Any]

Dictionary with extracted metadata (experiment_id, cell_id, temperature_C, rpt_id, etc.)

Example

>>> meta = parse_filename_metadata("Expt 5 - cell A (10degC) - Processed Data.csv")
>>> print(meta)  # {'experiment_id': 5, 'cell_id': 'A', 'temperature_C': 10}

Source code in src/data/discovery.py
def parse_filename_metadata(filename: str) -> Dict[str, Any]:
    """Extract metadata from standardized filename.

    Args:
        filename: Filename to parse

    Returns:
        Dictionary with extracted metadata (experiment_id, cell_id,
        temperature_C, rpt_id, curve_type, direction — only the keys
        actually found in the filename are present)

    Example:
        >>> meta = parse_filename_metadata("Expt 5 - cell A (10degC) - Processed Data.csv")
        >>> print(meta)  # {'experiment_id': 5, 'cell_id': 'A', 'temperature_C': 10}
    """
    meta: Dict[str, Any] = {}

    # Extract experiment ID
    expt_match = re.search(r'[Ee]xpt\s*(\d+)', filename)
    if expt_match:
        meta['experiment_id'] = int(expt_match.group(1))

    # Extract cell ID (A-H)
    cell_match = re.search(r'cell\s*([A-Ha-h])', filename, re.IGNORECASE)
    if cell_match:
        meta['cell_id'] = cell_match.group(1).upper()

    # Extract temperature
    temp_match = re.search(r'(\d+)\s*deg[Cc]', filename)
    if temp_match:
        meta['temperature_C'] = int(temp_match.group(1))

    # Extract RPT index
    rpt_match = re.search(r'RPT\s*(\d+)', filename)
    if rpt_match:
        meta['rpt_id'] = int(rpt_match.group(1))

    # Extract curve type (C-rate such as "0.1C"). Only a small window
    # around the match is checked for 'deg', so it is not mistaken for a
    # temperature token. The previous guard scanned from the match to the
    # END of the filename, so any "degC" appearing later (e.g.
    # "0.1C discharge (10degC)") wrongly suppressed a legitimate C-rate.
    curve_match = re.search(r'(\d+\.?\d*)[Cc]', filename)
    if curve_match:
        window = filename[max(0, curve_match.start() - 3):curve_match.end() + 3]
        if 'deg' not in window:
            meta['curve_type'] = curve_match.group(1) + "C"

    # Extract direction ('discharge' must be tested first: it contains 'charge')
    if 'discharge' in filename.lower():
        meta['direction'] = 'discharge'
    elif 'charge' in filename.lower():
        meta['direction'] = 'charge'

    return meta

validate_data_structure

validate_data_structure(base_path: Path, experiment_id: int) -> Dict[str, Any]

Validate that expected data structure exists.

Parameters:

Name Type Description Default
base_path Path

Base path to data

required
experiment_id int

Experiment ID

required

Returns:

Type Description
Dict[str, Any]

Dictionary with validation results:

Dict[str, Any]
  • 'valid': bool
Dict[str, Any]
  • 'missing': List of missing paths
Dict[str, Any]
  • 'found': List of found paths
Dict[str, Any]
  • 'cells': List of cells with data
Source code in src/data/discovery.py
def validate_data_structure(base_path: Path, experiment_id: int) -> Dict[str, Any]:
    """Validate that expected data structure exists.

    Args:
        base_path: Base path to data
        experiment_id: Experiment ID

    Returns:
        Dictionary with validation results:
        - 'valid': bool
        - 'missing': List of missing paths
        - 'found': List of found paths
        - 'cells': List of cells with data
    """
    paths = ExperimentPaths(experiment_id, base_path)

    result: Dict[str, Any] = {
        'valid': True,
        'missing': [],
        'found': [],
        'cells': [],
    }

    # Bail out early if the experiment directory itself is absent.
    if not paths.exists():
        result['valid'] = False
        result['missing'].append(str(paths.expt_path))
        return result

    result['found'].append(str(paths.expt_path))

    # Expected cell layout per experiment: {temperature_C: [cell IDs]}
    temp_map = {
        5: {10: ['A', 'B', 'C'], 25: ['D', 'E'], 40: ['F', 'G', 'H']},
        # Add other experiments as needed
    }

    expected_layout = temp_map.get(experiment_id, {})
    for temp_C, cells in expected_layout.items():
        for cell_id in cells:
            summary = paths.performance_summary(cell_id, temp_C)
            if summary.exists():
                result['found'].append(str(summary))
                if cell_id not in result['cells']:
                    result['cells'].append(cell_id)
            else:
                result['missing'].append(str(summary))

    # Valid only when at least one cell has data on disk.
    result['valid'] = len(result['cells']) > 0

    return result