Skip to content

Data API

edl_ml.data.features

Feature definitions and sampling utilities for the EDL dataset.

The ML surrogate is trained to reproduce the full differential capacitance curve C_dl(E) produced by the Gouy-Chapman-Stern solver. Each training sample is parameterised by five physical variables drawn from a Latin hypercube inside SamplingBounds. Concentration is sampled on a log-uniform grid because the diffuse-layer capacitance depends on :math:`\sqrt{c}` through the Debye length; a uniform scale would give poor coverage of dilute solutions.

FEATURE_COLUMNS module-attribute

FEATURE_COLUMNS: Final[tuple[str, ...]] = ('log10_concentration_mol_l', 'valence', 'temperature_k', 'stern_thickness_ang', 'stern_permittivity')

Ordered feature names stored in every dataset row.

TARGET_COLUMN module-attribute

TARGET_COLUMN: Final[str] = 'capacitance_uf_cm2'

Target variable name (differential capacitance).

SamplingBounds dataclass

Inclusive sampling bounds for the five input features.

Defaults bracket physically reasonable aqueous electrochemistry: 1 mM–1 M symmetric electrolyte, z=1 or 2, 283–343 K, Stern thickness 2.5–6 Å, Stern permittivity 5–15.

Source code in src/edl_ml/data/features.py
@dataclass(frozen=True, slots=True)
class SamplingBounds:
    """Inclusive sampling bounds for the five input features.

    Defaults bracket physically reasonable aqueous electrochemistry:
    1 mM–1 M symmetric electrolyte, z=1 or 2, 283–343 K, Stern thickness
    2.5–6 Å, Stern permittivity 5–15.
    """

    log10_concentration_min: float = -3.0
    log10_concentration_max: float = 0.0
    valence_choices: tuple[int, ...] = (1, 2)
    temperature_min_k: float = 283.15
    temperature_max_k: float = 343.15
    stern_thickness_min_ang: float = 2.5
    stern_thickness_max_ang: float = 6.0
    stern_permittivity_min: float = 5.0
    stern_permittivity_max: float = 15.0
    potential_min_v: float = -0.4
    potential_max_v: float = 0.4
    potential_n_points: int = 81

    def __post_init__(self) -> None:
        if self.log10_concentration_min >= self.log10_concentration_max:
            raise ValueError("invalid concentration range")
        if not self.valence_choices:
            raise ValueError("valence_choices must be non-empty")
        if self.temperature_min_k >= self.temperature_max_k:
            raise ValueError("invalid temperature range")
        if self.potential_n_points < 5:
            raise ValueError("potential_n_points too small")

latin_hypercube_samples

latin_hypercube_samples(bounds: SamplingBounds, n_samples: int, seed: int | None = 0) -> NDArray[np.float64]

Generate a Latin hypercube sample of feature vectors.

Parameters:

Name Type Description Default
bounds SamplingBounds

Sampling bounds object.

required
n_samples int

Number of samples to draw.

required
seed int | None

Random seed for reproducibility.

0

Returns:

Type Description
ndarray of shape ``(n_samples, 5)``

Columns in the order given by :data:`FEATURE_COLUMNS`. Valence is mapped to an element of ``bounds.valence_choices`` after uniform sampling on ``[0, 1]``.

Source code in src/edl_ml/data/features.py
def latin_hypercube_samples(
    bounds: SamplingBounds,
    n_samples: int,
    seed: int | None = 0,
) -> NDArray[np.float64]:
    """Draw ``n_samples`` Latin-hypercube feature vectors within *bounds*.

    Parameters
    ----------
    bounds
        Sampling bounds object.
    n_samples
        Number of samples to draw; must be positive.
    seed
        Random seed for reproducibility.

    Returns
    -------
    ndarray of shape ``(n_samples, 5)``
        Columns follow the order of :data:`FEATURE_COLUMNS`. The valence
        column is snapped to an element of ``bounds.valence_choices`` by
        binning the unit-interval draw.
    """
    if n_samples < 1:
        raise ValueError("n_samples must be positive")

    unit = np.asarray(
        qmc.LatinHypercube(d=5, seed=seed).random(n=n_samples),
        dtype=np.float64,
    )

    def lerp(lo: float, hi: float, column: NDArray[np.float64]) -> NDArray[np.float64]:
        # Affine map from the unit interval onto [lo, hi].
        return lo + column * (hi - lo)

    out = np.zeros_like(unit)
    out[:, 0] = lerp(
        bounds.log10_concentration_min, bounds.log10_concentration_max, unit[:, 0]
    )
    # Bin the uniform draw into equal-width buckets, one per valence choice;
    # the clip guards against a draw of exactly 1.0.
    choices = np.asarray(bounds.valence_choices, dtype=float)
    bins = np.clip(
        np.floor(unit[:, 1] * choices.size).astype(int), 0, choices.size - 1
    )
    out[:, 1] = choices[bins]
    out[:, 2] = lerp(bounds.temperature_min_k, bounds.temperature_max_k, unit[:, 2])
    out[:, 3] = lerp(
        bounds.stern_thickness_min_ang, bounds.stern_thickness_max_ang, unit[:, 3]
    )
    out[:, 4] = lerp(
        bounds.stern_permittivity_min, bounds.stern_permittivity_max, unit[:, 4]
    )
    return np.asarray(out, dtype=np.float64)

edl_ml.data.generate

High-throughput dataset generation driven by the Gouy-Chapman-Stern solver.

SweepResult dataclass

Outputs of a single Gouy-Chapman-Stern sweep over electrode potential.

Attributes:

Name Type Description
features NDArray[float64]

Length-5 feature vector, matching :data:`FEATURE_COLUMNS`.

potentials_v NDArray[float64]

Electrode potentials, V.

capacitance_f_m2 NDArray[float64]

Total differential capacitance at each potential, F/m².

surface_charge_c_m2 NDArray[float64]

Diffuse-layer surface charge, C/m².

Source code in src/edl_ml/data/generate.py
@dataclass(frozen=True, slots=True)
class SweepResult:
    """Outputs of a single Gouy-Chapman-Stern sweep over electrode potential.

    Immutable container returned by :func:`run_single_sweep`. All three
    result arrays share the length of ``potentials_v``.

    Attributes
    ----------
    features
        Length-5 feature vector, matching :data:`FEATURE_COLUMNS`.
    potentials_v
        Electrode potentials, V.
    capacitance_f_m2
        Total differential capacitance at each potential, F/m².
    surface_charge_c_m2
        Diffuse-layer surface charge, C/m².
    """

    # Ordered as :data:`FEATURE_COLUMNS`; shape (5,).
    features: NDArray[np.float64]
    # Electrode potential grid the solver was evaluated on, volts.
    potentials_v: NDArray[np.float64]
    # Total differential capacitance C_dl(E) in SI units (F/m²); converted
    # to µF/cm² downstream when assembling the dataset.
    capacitance_f_m2: NDArray[np.float64]
    # Diffuse-layer surface charge density at each potential, C/m².
    surface_charge_c_m2: NDArray[np.float64]

build_capacitance_dataset

build_capacitance_dataset(bounds: SamplingBounds, n_samples: int, *, seed: int | None = 0, parallel: bool = True, max_workers: int | None = None) -> pd.DataFrame

Build a tidy dataset of capacitance values for ML training.

The returned DataFrame is long-format: every row represents one (features, electrode_potential) pair with the corresponding total differential capacitance. This layout is convenient for scikit-learn and torch dataset consumers.

Parameters:

Name Type Description Default
bounds SamplingBounds

Sampling bounds object.

required
n_samples int

Number of Latin hypercube samples.

required
seed int | None

Random seed.

0
parallel bool

Whether to run sweeps in a process pool.

True
max_workers int | None

Process pool size. None uses the default.

None

Returns:

Type Description
DataFrame with columns:

log10_concentration_mol_l, valence, temperature_k, stern_thickness_ang, stern_permittivity, electrode_potential_v, capacitance_uf_cm2, surface_charge_uc_cm2.

Source code in src/edl_ml/data/generate.py
def build_capacitance_dataset(
    bounds: SamplingBounds,
    n_samples: int,
    *,
    seed: int | None = 0,
    parallel: bool = True,
    max_workers: int | None = None,
) -> pd.DataFrame:
    """Generate a long-format capacitance dataset for ML training.

    Every row pairs one Latin-hypercube feature vector with one electrode
    potential and the total differential capacitance computed there — a
    layout that scikit-learn and torch dataset consumers ingest directly.

    Parameters
    ----------
    bounds
        Sampling bounds object.
    n_samples
        Number of Latin hypercube samples.
    seed
        Random seed.
    parallel
        Whether to run sweeps in a process pool.
    max_workers
        Process pool size. ``None`` uses the default.

    Returns
    -------
    DataFrame with columns:
        ``log10_concentration_mol_l``, ``valence``, ``temperature_k``,
        ``stern_thickness_ang``, ``stern_permittivity``,
        ``electrode_potential_v``, ``capacitance_uf_cm2``,
        ``surface_charge_uc_cm2``.
    """
    samples = latin_hypercube_samples(bounds, n_samples, seed=seed)
    potentials = np.linspace(
        bounds.potential_min_v, bounds.potential_max_v, bounds.potential_n_points
    )

    if parallel and n_samples > 1:
        with ProcessPoolExecutor(max_workers=max_workers) as pool:
            sweeps = list(
                pool.map(_sweep_worker, samples, [potentials] * len(samples))
            )
    else:
        sweeps = [_sweep_worker(sample, potentials) for sample in samples]

    records: list[dict[str, float]] = []
    for features, cap, sigma in sweeps:
        base = dict(zip(FEATURE_COLUMNS, features, strict=True))
        for e, c, s in zip(potentials, cap, sigma, strict=True):
            records.append(
                {
                    **base,
                    "electrode_potential_v": float(e),
                    # Solver works in SI: 1 F/m² == 100 µF/cm².
                    "capacitance_uf_cm2": float(c) * 100.0,
                    "surface_charge_uc_cm2": float(s) * 100.0,
                }
            )

    frame = pd.DataFrame(records)
    # Sort gives a deterministic row order regardless of worker scheduling.
    sort_cols = [*FEATURE_COLUMNS, "electrode_potential_v"]
    return frame.sort_values(sort_cols).reset_index(drop=True)

run_single_sweep

run_single_sweep(feature_vector: NDArray[float64], potentials_v: NDArray[float64]) -> SweepResult

Run the GCS solver for one feature vector over an electrode potential grid.

Parameters:

Name Type Description Default
feature_vector NDArray[float64]

Five-element array matching :data:FEATURE_COLUMNS.

required
potentials_v NDArray[float64]

Electrode potentials to sweep, V.

required

Returns:

Type Description
SweepResult
Source code in src/edl_ml/data/generate.py
def run_single_sweep(
    feature_vector: NDArray[np.float64],
    potentials_v: NDArray[np.float64],
) -> SweepResult:
    """Evaluate the GCS solver for one feature vector over a potential grid.

    Parameters
    ----------
    feature_vector
        Five-element array matching :data:`FEATURE_COLUMNS`.
    potentials_v
        Electrode potentials to sweep, V.

    Returns
    -------
    SweepResult
    """
    vec = np.asarray(feature_vector, dtype=np.float64)
    # The feature vector stores log10(concentration) and Ångström thickness;
    # the solver expects mol/L and metres.
    params = GCSParameters(
        concentration_mol_l=float(10.0 ** vec[0]),
        valence=int(round(vec[1])),
        temperature_k=float(vec[2]),
        stern_thickness_m=float(vec[3]) * 1e-10,  # Å → m
        stern_permittivity=float(vec[4]),
    )
    sigma, _, cap = gouy_chapman_stern(params, potentials_v)
    return SweepResult(
        features=vec,
        potentials_v=potentials_v,
        capacitance_f_m2=cap,
        surface_charge_c_m2=sigma,
    )

save_dataset

save_dataset(df: DataFrame, path: Path | str) -> None

Save a dataset to a parquet file, creating parent directories as needed.

Source code in src/edl_ml/data/generate.py
def save_dataset(df: pd.DataFrame, path: Path | str) -> None:
    """Write *df* to a parquet file, creating parent directories as needed."""
    target = Path(path)
    # exist_ok tolerates an already-present directory tree.
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(target, index=False)

load_dataset

load_dataset(path: Path | str) -> pd.DataFrame

Load a dataset previously produced by :func:`build_capacitance_dataset`.

Source code in src/edl_ml/data/generate.py
def load_dataset(path: Path | str) -> pd.DataFrame:
    """Read back a parquet dataset written by :func:`build_capacitance_dataset`."""
    frame = pd.read_parquet(path)
    return frame

split_by_sample

split_by_sample(df: DataFrame, val_fraction: float = 0.15, test_fraction: float = 0.15, seed: int | None = 0) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]

Split the dataset so that every sweep is entirely in one split.

Splitting at the sweep level (rather than the row level) prevents information leakage between train and test capacitance curves that share the same physical parameters.

Parameters:

Name Type Description Default
df DataFrame

Output of :func:`build_capacitance_dataset`.

required
val_fraction float

Fractions in (0, 1). Their sum must be strictly below 1.

0.15
test_fraction float

Fractions in (0, 1). Their sum must be strictly below 1.

0.15
seed int | None

RNG seed.

0

Returns:

Type Description
tuple

(train_df, val_df, test_df).

Source code in src/edl_ml/data/generate.py
def split_by_sample(
    df: pd.DataFrame,
    val_fraction: float = 0.15,
    test_fraction: float = 0.15,
    seed: int | None = 0,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split the dataset so that every sweep is entirely in one split.

    Splitting at the sweep level (rather than the row level) prevents
    information leakage between train and test capacitance curves that share
    the same physical parameters.

    Parameters
    ----------
    df
        Output of :func:`build_capacitance_dataset`.
    val_fraction, test_fraction
        Fractions in (0, 1). Their sum must be strictly below 1.
    seed
        RNG seed.

    Returns
    -------
    tuple
        ``(train_df, val_df, test_df)``.
    """
    if not 0 < val_fraction < 1 or not 0 < test_fraction < 1:
        raise ValueError("fractions must lie in (0, 1)")
    if val_fraction + test_fraction >= 1:
        raise ValueError("val + test fractions must be < 1")

    # Assign every row the id of its unique feature combination. ngroup with
    # sort=False numbers groups in first-appearance order, matching the
    # previous drop_duplicates-based indexing, while replacing the per-row
    # Python loop (and NaN-unsafe tuple dict keys) with vectorized pandas.
    codes = (
        df.groupby(list(FEATURE_COLUMNS), sort=False, dropna=False)
        .ngroup()
        .to_numpy()
    )
    n_unique = len(np.unique(codes))

    rng = np.random.default_rng(seed)
    order = rng.permutation(n_unique)
    n_val = int(round(val_fraction * n_unique))
    n_test = int(round(test_fraction * n_unique))

    # Membership masks per split; everything not val/test is train.
    test_mask = np.isin(codes, order[:n_test])
    val_mask = np.isin(codes, order[n_test : n_test + n_val])
    train_mask = ~(test_mask | val_mask)
    return (
        df[train_mask].reset_index(drop=True),
        df[val_mask].reset_index(drop=True),
        df[test_mask].reset_index(drop=True),
    )

summarise_dataset

summarise_dataset(df: DataFrame) -> dict[str, float]

Return simple summary statistics for logging.

Returns:

Type Description
dict

Keys: n_rows, n_unique_samples, cap_mean, cap_std, cap_min, cap_max.

Source code in src/edl_ml/data/generate.py
def summarise_dataset(df: pd.DataFrame) -> dict[str, float]:
    """Return simple summary statistics for logging.

    Returns
    -------
    dict
        Keys: ``n_rows``, ``n_unique_samples``, ``cap_mean``, ``cap_std``,
        ``cap_min``, ``cap_max``.
    """
    cap = df["capacitance_uf_cm2"]
    n_unique = len(df.drop_duplicates(subset=list(FEATURE_COLUMNS)))
    stats: dict[str, float] = {"n_rows": len(df), "n_unique_samples": n_unique}
    for key, value in (
        ("cap_mean", cap.mean()),
        ("cap_std", cap.std()),
        ("cap_min", cap.min()),
        ("cap_max", cap.max()),
    ):
        stats[key] = float(value)
    return stats