# Source code for FOX.io.read_xyz

"""A module for reading multi-xyz files.

Index
-----
.. currentmodule:: FOX.io.read_xyz
.. autosummary::
    XYZError
    read_multi_xyz
    get_comments
    validate_xyz

API
---
.. autoexception:: XYZError
.. autofunction:: read_multi_xyz
.. autofunction:: get_comments
.. autofunction:: validate_xyz

"""

import reprlib
from typing import Tuple, Dict, Iterable, List, Iterator, Generator, overload
from itertools import islice, chain

import numpy as np
from scm.plams import Units
from nanoutils import Literal, PathType, group_by_values

# Only `read_multi_xyz` is exported by a star-import of this module;
# every other name defined here has to be imported explicitly.
__all__ = ['read_multi_xyz']


class XYZError(OSError):
    """An :exc:`OSError` subclass signalling that an .xyz file could not be parsed."""


# Return type of `read_multi_xyz` when `return_comment` is False:
# a coordinate array and a dictionary mapping strings to lists of integers.
XYZoutput1 = Tuple[np.ndarray, Dict[str, List[int]]]
# Return type of `read_multi_xyz` when `return_comment` is True:
# the same two items followed by an additional array (the comments).
XYZoutput2 = Tuple[np.ndarray, Dict[str, List[int]], np.ndarray]


@overload
def read_multi_xyz(filename: PathType, return_comment: Literal[True] = ..., unit: str = ...) -> XYZoutput2: ...  # noqa: E501
@overload
def read_multi_xyz(filename: PathType, return_comment: Literal[False], unit: str = ...) -> XYZoutput1: ...  # noqa: E501
def read_multi_xyz(filename, return_comment=True, unit='angstrom'):  # noqa: E302
    r"""Read a (multi) .xyz file.

    The number of atoms per molecule is taken from the first line of the file
    and is assumed to be identical for all subsequent Cartesian coordinate blocks.

    Parameters
    ----------
    filename : str
        The path+filename of a (multi) .xyz file.
    return_comment : bool
        Whether or not the comment line in each Cartesian coordinate block should be returned.
        Returned as a 1D array of strings.
    unit : :class:`str`
        The unit of the to-be returned array.
        The coordinates in the file are treated as being in Angstrom and are
        converted whenever **unit** is not ``"angstrom"``.

    Returns
    -------
    :math:`m*n*3` |np.ndarray|_ [|np.float64|_], |dict|_ [|str|_, |list|_ [|int|_]] and\
    (optional) :math:`m` |np.ndarray|_ [|str|_]:
        * A 3D array with Cartesian coordinates of :math:`m` molecules with :math:`n` atoms.
        * A dictionary with atomic symbols as keys and lists of matching atomic indices as values.
        * (Optional) a 1D array with :math:`m` comments.

    Raises
    ------
    :exc:`.XYZError`
        Raised when issues are encountered related to parsing .xyz files,
        including files whose number of coordinates does not match the atom count.

    """
    # Define constants and construct a dictionary: {atomic symbols: [atomic indices]}
    with open(filename, 'r') as f:
        atom_count = _get_atom_count(f)
        idx_dict = _get_idx_dict(f, atom_count)

    # Re-open the file so that the coordinates are parsed from the very first block onwards
    with open(filename, 'r') as f:
        iterator = chain.from_iterable(_xyz_generator(f, atom_count))
        try:
            xyz = np.fromiter(iterator, dtype=np.float64)
        except ValueError as ex:  # Failed to parse the .xyz file
            raise XYZError("Failed to parse the passed xyz file") from ex

    # A truncated file or blocks with a varying number of atoms yield a flat array
    # that cannot be reshaped; report this as a parsing error rather than a bare ValueError
    try:
        xyz.shape = (-1, atom_count, 3)
    except ValueError as ex:
        raise XYZError(
            f"Failed to parse the passed xyz file: {xyz.size} parsed coordinates cannot be "
            f"divided into blocks of {atom_count} atoms with 3 coordinates each"
        ) from ex

    if unit != 'angstrom':
        xyz *= Units.conversion_ratio('angstrom', unit)

    if return_comment:
        return xyz, idx_dict, get_comments(filename, atom_count)
    else:
        return xyz, idx_dict


def _xyz_generator(f: Iterable[str], atom_count: int) -> Generator[Iterator[str], None, None]:
    """Yield, per Cartesian coordinate block in **f**, an iterator over its coordinate fields.

    Every non-blank line encountered by the outer loop is treated as the atom-count
    line of a new block. The line directly after it (the comment) is skipped and,
    for each of the following **atom_count** lines, all whitespace-separated fields
    except the first one are emitted as strings.

    The yielded iterators read lazily from **f**, so each one has to be exhausted
    before the next one is requested.

    """
    for header in f:
        # Blank lines in between blocks are tolerated and simply skipped
        if header.strip():
            # Position 0 of the remaining lines is the comment; the atoms follow it
            atom_lines = islice(f, 1, atom_count + 1)
            yield chain.from_iterable(map(lambda line: line.split()[1:], atom_lines))


def get_comments(filename: PathType, atom_count: int) -> np.ndarray:
    """Read and return all comment lines in an xyz file.

    A single comment line should be located under the atom count of each molecule.
    Empty lines in between Cartesian coordinate blocks are skipped, consistent
    with how the coordinates themselves are parsed by :func:`_xyz_generator`.

    Parameters
    ----------
    filename : str
        The path+filename of a (multi) .xyz file.

    atom_count : int
        The number of atoms per molecule.

    Returns
    -------
    :math:`m` |np.ndarray|_ [|str|_]:
        A 1D array with :math:`m` comments extracted from **filename**;
        trailing whitespace is stripped from each comment.

    """
    comments = []
    with open(filename, 'r') as f:
        for at_count in f:
            # Allow for empty lines between xyz blocks; a fixed line stride
            # would otherwise drift and return atom counts or coordinates
            if not at_count.strip():
                continue

            comment = next(f, None)
            if comment is None:  # Truncated file: an atom count without a comment line
                break
            comments.append(comment.rstrip())

            # Skip past the atom lines of the current block
            for _ in islice(f, atom_count):
                pass
    return np.array(comments)


def _get_atom_count(f: Iterator[str]) -> int:
    """Extract the number of atoms per molecule from the first line in an .xyz file.

    Exactly one line is consumed from **f**.

    Parameters
    ----------
    f : |io.TextIOWrapper|_
        An opened .xyz file.

    Returns
    -------
    |int|_:
        The number of atoms per molecule.

    Raises
    ------
    :exc:`.XYZError`
        Raised when issues are encountered related to parsing .xyz files:
        the first line is not a valid integer or **f** is empty.

    """
    # Fall back to an empty string for an exhausted iterator (e.g. an empty file),
    # so that the failure surfaces as an XYZError below instead of a StopIteration
    ret = next(f, '')
    try:
        return int(ret)
    except ValueError as ex:
        err = (f"{reprlib.repr(ret)} is not a valid integer, the first line in an .xyz file "
               "should contain the number of atoms per molecule")
        raise XYZError(err) from ex


def _get_idx_dict(f: Iterable[str], atom_count: int) -> Dict[str, List[int]]:
    """Collect the atomic symbols of a single Cartesian coordinate block in **f**.

    The first line read from **f** is skipped; the first whitespace-separated
    field of each of the following **atom_count** lines is taken as the atomic symbol
    and normalized with :meth:`str.capitalize`.

    Parameters
    ----------
    f : |io.TextIOWrapper|_
        An opened .xyz file.

    atom_count : int
        The number of atoms per molecule.

    Returns
    -------
    |dict|_ [|str|_, |list|_ [|int|_]]:
        A dictionary with atomic symbols and a list of matching atomic indices.

    """
    # Position 0 of the remaining lines is skipped; the atom lines follow it
    atom_lines = islice(f, 1, atom_count + 1)

    symbols = []
    for line in atom_lines:
        first_field = line.split(maxsplit=1)[0]
        symbols.append(first_field.capitalize())
    return group_by_values(enumerate(symbols))