Source code for FOX.recipes.similarity

"""Recipes for computing the similarity between trajectories.

Examples
--------
An example where, starting from two .xyz files, the similarity
is computed between two molecular dynamics (MD) trajectories.

.. code-block:: python

    >>> import numpy as np
    >>> import FOX
    >>> from FOX.recipes import compare_trajectories

    # The relevant multi-xyz files
    >>> md_filename: str = ...
    >>> md = FOX.MultiMolecule.from_xyz(md_filename)
    >>> md_ref_filename: str = ...
    >>> md_ref = FOX.MultiMolecule.from_xyz(md_ref_filename)

    # Calculate the similarity between `md` and `md_ref`
    >>> similarity = compare_trajectories(md, md_ref, metric="cosine")

    # Identify all sufficiently dissimilar molecules (as defined via `threshold`)
    >>> threshold: float = ...
    >>> idx = np.zeros(len(md), dtype=np.bool_)
    >>> idx[similarity >= threshold] = True

The resulting indices can be used for, for example, identifying all molecules
one wants to use for further (quantum-mechanical/classical) calculations.

.. code-block:: python

    >>> import qmflows

    # Define the job settings
    >>> s = qmflows.Settings()
    >>> s.lattice = [50, 50, 50]
    >>> s.specific.cp2k.motion.print["forces on"].filename = ""
    >>> s.overlay(qmflows.templates.singlepoint)

    # Construct the job list
    >>> mol_list = md[idx].as_Molecule()
    >>> job_list = [qmflows.cp2k(s, mol) for mol in mol_list]

    # Run the jobs
    >>> result_list = [qmflows.run(job) for job in job_list]

    # Extract the forces and energies from all jobs
    >>> forces = np.array([r.forces for r in result_list])[:, 0]
    >>> energy = np.array([r.energy for r in result_list])[:, 0]


Index
-----
.. currentmodule:: FOX.recipes
.. autosummary::
    compare_trajectories
    fps_reduce

API
---
.. autofunction:: compare_trajectories
.. autofunction:: fps_reduce

"""

from typing import Optional
from itertools import islice
from functools import partial

import numpy as np
from FOX import MultiMolecule
from scipy.spatial.distance import cdist

try:
    from CAT.distribution import uniform_idx
    CAT_EX: Optional[ImportError] = None
except ImportError as ex:
    CAT_EX = ex

__all__ = ["compare_trajectories", "fps_reduce"]


def _parse_md(md, name, dtype=np.float64):
    md_ar = np.array(md, dtype=dtype, ndmin=3, copy=False, subok=True)
    if md_ar.ndim != 3:
        raise ValueError(f"`{name}` expected a <= 3D array; observed dimensionality: {md_ar.ndim}")
    elif not isinstance(md_ar, MultiMolecule):
        return md_ar.view(MultiMolecule)
    else:
        return md_ar


[docs]def compare_trajectories(md, md_ref, *, metric='cosine', reduce=np.mean,
                         reset_origin=True, **kwargs):
    r"""Compute the similarity between 2 trajectories according to the specified **metric**.

    The default **metric** aliases :func:`scipy.spatial.distance.cdist` for defining
    the (dis-)similarity between the passed **md** and its reference.
    This (dis-)similarity array is subsequently reduced to a vector of size
    :math:`(N_{mol},)` by taking its mean (along the relevant axes).

    Examples
    --------
    .. code-block:: python

        >>> import numpy as np
        >>> from FOX.recipes import compare_trajectories

        >>> md: np.ndarray = ...
        >>> md_ref: np.ndarray = ...

        # Default `metric` presets
        >>> metric1 = compare_trajectories(md, md_ref, metric="cosine")
        >>> metric2 = compare_trajectories(md, md_ref, metric="euclidean")
        >>> metric3 = compare_trajectories(md, md_ref, metric="minkowski", p=1)

        >>> def rmsd(a: np.ndarray) -> np.float64:
        ...     '''Calculate the root-mean-square deviation.'''
        ...     return np.mean(a**2)**0.5

        # Sum over the number of atoms rather than average
        >>> metric4 = compare_trajectories(md, md_ref, reduce=np.sum)
        >>> metric5 = compare_trajectories(md, md_ref, reduce=rmsd)

        >>> def sqeuclidean(md: np.ndarray, md_ref: np.ndarray) -> np.ndarray:
        ...     '''Calculate the distance based on the squared eclidian norm.'''
        ...     delta = md[..., None] - md_ref[..., None, :]
        ...     return np.linalg.norm(delta, axis=-1)**2

        # Pass a custom metric-function
        >>> metric6 = compare_trajectories(md, md_ref, metric=sqeuclidean)

    Parameters
    ----------
    md : :term:`numpy:array_like`, shape :math:`(N_{mol}, N_{atom1}, 3)` or :math:`(N_{atom1}, 3)`
        An array-like object containing the trajectory of interest.
    md_ref : :term:`numpy:array_like`, shape :math:`(N_{mol}, N_{atom2}, 3)` or :math:`(N_{atom2}, 3)`
        An array-like object containing the reference trajectory.
    metric : :class:`str` or :class:`Callable[[FOX.MultiMolecule, FOX.MultiMolecule], np.ndarray] <collections.abc.Callable>`
        The type of metric used for calculating the (dis-)similarity.
        Accepts either a callback or predefined alias. See **metric** parameter
        in :func:`scipy.spatial.distance.cdist` for a comprehensive overview of all aliases.
        If a callback is provided then it should take a array of shape :math:`(n_{atom1}, 3)`
        and :math:`(N_{atom2}, 3)` as arguments and return a new array of
        shape :math:`(N_{atom1}, N_{atom2})`.
    reduce : :class:`Callable[[np.ndarray], np.number] <collections.abc.Callable>`, optional
        A callable for performing a dimensional reduction.
        Used for transforming the shape :math:`(N_{atom1}, N_{atom2})` array,
        returned by **metric**, into a scalar.
        Setting this value to :data:`None` will disable the reduction and return the
        **metric** output in unaltered form.
    reset_origin : :class:`bool`
        Reset the origin by removing translations and rotations
        from the passed trajectories.
    \**kwargs : :data:`~typing.Any`
        Further keyword arguments for **metric**.

    Returns
    -------
    :class:`np.ndarray[np.float64] <numpy.ndarray>`, shape :math:`(N_{mol},)`
        An array with the (dis-)similarity between all molecules in **md** and **md_ref**.

    See Also
    --------
    :func:`scipy.spatial.distance.cdist`
        Compute distance between each pair of the two collections of inputs.

    """  # noqa: E501
    # Parse the inputs; ensure that they are a 3D array
    md_ar = _parse_md(md, "md")
    md_ref_ar = _parse_md(md_ref, "md_ref")
    if len(md_ar) != len(md_ref_ar):
        raise ValueError("`md` and `md_ref` should be of the same length")

    # Remove translations and rotations
    if reset_origin:
        md_ref_ar = md_ref_ar.reset_origin(inplace=False)
        md_ar = md_ar.reset_origin(inplace=False, rot_ref=md_ref_ar[0])

    # Parse the metric
    if callable(metric):
        func = metric
    elif isinstance(metric, str):
        func = partial(cdist, metric=metric)
    else:
        raise TypeError("`metric` expected a string or a callable; observed type: "
                        f"{metric.__class__.__name__}")

    # Compute the (dis-)similarity
    if reduce is None:
        return np.array([func(a, b, **kwargs) for a, b in zip(md_ar, md_ref_ar)])
    else:
        return np.array([reduce(func(a, b, **kwargs)) for a, b in zip(md_ar, md_ref_ar)])


[docs]def fps_reduce(dist_mat, n=1, **kwargs):
    r"""Return the indices that yield a uniform distribution of **n** points.

    Examples
    --------
    .. code-block:: python

        >>> from functools import partial
        >>> import numpy as np
        >>> from FOX.recipes import compare_trajectories, fps_reduce

        >>> md: np.ndarray = ...
        >>> md_ref: np.ndarray = ...

        >>> reduce_func = partial(fps_reduce, n=10)
        >>> out = compare_trajectories(md, md_ref, reduce=reduce_func)

    Note
    ----
    This function requires the Compound Attachment Tools package:
    `CAT <https://github.com/nlesc-nano/CAT>`_.

    Parameters
    ----------
    dist_mat : :class:`np.ndarray[np.float64] <numpy.ndarray>`, shape :math:`(m_a, m_b)`
        A distance matrix.
    n : :class:`int`, optional
        The number of to-be returned indices.
    \**kwargs : :data:`~typing.Any`
        Further keyword arguments for :func:`CAT.distribution.uniform_idx`.

    Returns
    -------
    :class:`np.ndarray[np.int64] <numpy.ndarray>`, shape :math:`(n,)`
        An array of indices.

    See Also
    --------
    :func:`CAT.distribution.uniform_idx`
        Yield the column-indices that result in a uniform or clustered distribution.
    :func:`FOX.recipes.compare_trajectories`
        Compute the similarity between 2 trajectories according to the specified **metric**.

    """
    if CAT_EX is not None:
        raise CAT_EX
    elif np.ndim(dist_mat) != 2:
        raise ValueError("`dist_mat` expected a 2D array")

    count = -1 if n is None else n
    iterator = islice(uniform_idx(dist_mat, **kwargs), None, n)
    return np.fromiter(iterator, dtype=np.intp, count=count)