Source code for FOX.recipes.similarity

"""Recipes for computing the similarity between trajectories.

Examples
--------
An example where, starting from two .xyz files, the similarity
is computed between two molecular dynamics (MD) trajectories.

.. code-block:: python

    >>> import numpy as np
    >>> import FOX
    >>> from FOX.recipes import compare_trajectories

    # The relevant multi-xyz files
    >>> md_filename: str = ...
    >>> md = FOX.MultiMolecule.from_xyz(md_filename)
    >>> md_ref_filename: str = ...
    >>> md_ref = FOX.MultiMolecule.from_xyz(md_ref_filename)

    # Calculate the similarity between `md` and `md_ref`
    >>> similarity = compare_trajectories(md, md_ref, metric="cosine")

    # Identify all sufficiently dissimilar molecules (as defined via `threshold`)
    >>> threshold: float = ...
    >>> idx = np.zeros(len(md), dtype=np.bool_)
    >>> idx[similarity >= threshold] = True

The resulting indices can be used for, for example, identifying all molecules
one wants to use for further (quantum-mechanical/classical) calculations.

.. code-block:: python

    >>> import qmflows

    # Define the job settings
    >>> s = qmflows.Settings()
    >>> s.lattice = [50, 50, 50]
    >>> s.specific.cp2k.motion.print["forces on"].filename = ""
    >>> s.overlay(qmflows.templates.singlepoint)

    # Construct the job list
    >>> mol_list = md[idx].as_Molecule()
    >>> job_list = [qmflows.cp2k(s, mol) for mol in mol_list]

    # Run the jobs
    >>> result_list = [qmflows.run(job) for job in job_list]

    # Extract the forces and energies from all jobs
    >>> forces = np.array([r.forces for r in result_list])[:, 0]
    >>> energy = np.array([r.energy for r in result_list])[:, 0]


Index
-----
.. currentmodule:: FOX.recipes
.. autosummary::
    compare_trajectories
    fps_reduce

API
---
.. autofunction:: compare_trajectories
.. autofunction:: fps_reduce

"""

from typing import Optional
from itertools import islice
from functools import partial

import numpy as np
from FOX import MultiMolecule
from scipy.spatial.distance import cdist

try:
    from CAT.distribution import uniform_idx
    CAT_EX: Optional[ImportError] = None
except ImportError as ex:
    CAT_EX = ex

__all__ = ["compare_trajectories", "fps_reduce"]


def _parse_md(md, name, dtype=np.float64):
    md_ar = np.array(md, dtype=dtype, ndmin=3, copy=False, subok=True)
    if md_ar.ndim != 3:
        raise ValueError(f"`{name}` expected a <= 3D array; observed dimensionality: {md_ar.ndim}")
    elif not isinstance(md_ar, MultiMolecule):
        return md_ar.view(MultiMolecule)
    else:
        return md_ar


[docs]def compare_trajectories(md, md_ref, *, metric='cosine', reduce=np.mean, reset_origin=True, **kwargs): r"""Compute the similarity between 2 trajectories according to the specified **metric**. The default **metric** aliases :func:`scipy.spatial.distance.cdist` for defining the (dis-)similarity between the passed **md** and its reference. This (dis-)similarity array is subsequently reduced to a vector of size :math:`(N_{mol},)` by taking its mean (along the relevant axes). Examples -------- .. code-block:: python >>> import numpy as np >>> from FOX.recipes import compare_trajectories >>> md: np.ndarray = ... >>> md_ref: np.ndarray = ... # Default `metric` presets >>> metric1 = compare_trajectories(md, md_ref, metric="cosine") >>> metric2 = compare_trajectories(md, md_ref, metric="euclidean") >>> metric3 = compare_trajectories(md, md_ref, metric="minkowski", p=1) >>> def rmsd(a: np.ndarray) -> np.float64: ... '''Calculate the root-mean-square deviation.''' ... return np.mean(a**2)**0.5 # Sum over the number of atoms rather than average >>> metric4 = compare_trajectories(md, md_ref, reduce=np.sum) >>> metric5 = compare_trajectories(md, md_ref, reduce=rmsd) >>> def sqeuclidean(md: np.ndarray, md_ref: np.ndarray) -> np.ndarray: ... '''Calculate the distance based on the squared eclidian norm.''' ... delta = md[..., None] - md_ref[..., None, :] ... return np.linalg.norm(delta, axis=-1)**2 # Pass a custom metric-function >>> metric6 = compare_trajectories(md, md_ref, metric=sqeuclidean) Parameters ---------- md : :term:`numpy:array_like`, shape :math:`(N_{mol}, N_{atom1}, 3)` or :math:`(N_{atom1}, 3)` An array-like object containing the trajectory of interest. md_ref : :term:`numpy:array_like`, shape :math:`(N_{mol}, N_{atom2}, 3)` or :math:`(N_{atom2}, 3)` An array-like object containing the reference trajectory. metric : :class:`str` or :class:`Callable[[FOX.MultiMolecule, FOX.MultiMolecule], np.ndarray] <collections.abc.Callable>` The type of metric used for calculating the (dis-)similarity. Accepts either a callback or predefined alias. See **metric** parameter in :func:`scipy.spatial.distance.cdist` for a comprehensive overview of all aliases. If a callback is provided then it should take a array of shape :math:`(n_{atom1}, 3)` and :math:`(N_{atom2}, 3)` as arguments and return a new array of shape :math:`(N_{atom1}, N_{atom2})`. reduce : :class:`Callable[[np.ndarray], np.number] <collections.abc.Callable>`, optional A callable for performing a dimensional reduction. Used for transforming the shape :math:`(N_{atom1}, N_{atom2})` array, returned by **metric**, into a scalar. Setting this value to :data:`None` will disable the reduction and return the **metric** output in unaltered form. reset_origin : :class:`bool` Reset the origin by removing translations and rotations from the passed trajectories. \**kwargs : :data:`~typing.Any` Further keyword arguments for **metric**. Returns ------- :class:`np.ndarray[np.float64] <numpy.ndarray>`, shape :math:`(N_{mol},)` An array with the (dis-)similarity between all molecules in **md** and **md_ref**. See Also -------- :func:`scipy.spatial.distance.cdist` Compute distance between each pair of the two collections of inputs. """ # noqa: E501 # Parse the inputs; ensure that they are a 3D array md_ar = _parse_md(md, "md") md_ref_ar = _parse_md(md_ref, "md_ref") if len(md_ar) != len(md_ref_ar): raise ValueError("`md` and `md_ref` should be of the same length") # Remove translations and rotations if reset_origin: md_ref_ar = md_ref_ar.reset_origin(inplace=False) md_ar = md_ar.reset_origin(inplace=False, rot_ref=md_ref_ar[0]) # Parse the metric if callable(metric): func = metric elif isinstance(metric, str): func = partial(cdist, metric=metric) else: raise TypeError("`metric` expected a string or a callable; observed type: " f"{metric.__class__.__name__}") # Compute the (dis-)similarity if reduce is None: return np.array([func(a, b, **kwargs) for a, b in zip(md_ar, md_ref_ar)]) else: return np.array([reduce(func(a, b, **kwargs)) for a, b in zip(md_ar, md_ref_ar)])
[docs]def fps_reduce(dist_mat, n=1, **kwargs): r"""Return the indices that yield a uniform distribution of **n** points. Examples -------- .. code-block:: python >>> from functools import partial >>> import numpy as np >>> from FOX.recipes import compare_trajectories, fps_reduce >>> md: np.ndarray = ... >>> md_ref: np.ndarray = ... >>> reduce_func = partial(fps_reduce, n=10) >>> out = compare_trajectories(md, md_ref, reduce=reduce_func) Note ---- This function requires the Compound Attachment Tools package: `CAT <https://github.com/nlesc-nano/CAT>`_. Parameters ---------- dist_mat : :class:`np.ndarray[np.float64] <numpy.ndarray>`, shape :math:`(m_a, m_b)` A distance matrix. n : :class:`int`, optional The number of to-be returned indices. \**kwargs : :data:`~typing.Any` Further keyword arguments for :func:`CAT.distribution.uniform_idx`. Returns ------- :class:`np.ndarray[np.int64] <numpy.ndarray>`, shape :math:`(n,)` An array of indices. See Also -------- :func:`CAT.distribution.uniform_idx` Yield the column-indices that result in a uniform or clustered distribution. :func:`FOX.recipes.compare_trajectories` Compute the similarity between 2 trajectories according to the specified **metric**. """ if CAT_EX is not None: raise CAT_EX elif np.ndim(dist_mat) != 2: raise ValueError("`dist_mat` expected a 2D array") count = -1 if n is None else n iterator = islice(uniform_idx(dist_mat, **kwargs), None, n) return np.fromiter(iterator, dtype=np.intp, count=count)