Source code for FOX.io.read_xyz

"""
FOX.io.read_xyz
===============

A module for reading multi-xyz files.

Index
-----
.. currentmodule:: FOX.io.read_xyz
.. autosummary::
    XYZError
    read_multi_xyz
    get_comments
    validate_xyz
    _get_atom_count
    _get_line_count
    _get_idx_dict

API
---
.. autoexception:: FOX.io.read_xyz.XYZError
.. autofunction:: FOX.io.read_xyz.read_multi_xyz
.. autofunction:: FOX.io.read_xyz.get_comments
.. autofunction:: FOX.io.read_xyz.validate_xyz
.. autofunction:: FOX.io.read_xyz._get_atom_count
.. autofunction:: FOX.io.read_xyz._get_line_count
.. autofunction:: FOX.io.read_xyz._get_idx_dict

"""

import os
import reprlib
from typing import Tuple, Dict, Iterable, List, Union, Iterator, Generator, AnyStr
from itertools import islice, chain

import numpy as np

from ..functions.utils import group_by_values

__all__ = ['read_multi_xyz']


class XYZError(OSError):
    """Raise when there are issues related to parsing .xyz files."""


XYZoutput = Union[Tuple[np.ndarray, Dict[str, List[int]]],
                  Tuple[np.ndarray, Dict[str, List[int]], np.ndarray]]


[docs]def read_multi_xyz(filename: Union[AnyStr, os.PathLike], return_comment: bool = True) -> XYZoutput:
    r"""Read a (multi) .xyz file.

    Parameters
    ----------
    filename : str
        The path+filename of a (multi) .xyz file.

    return_comment : bool
        Whether or not the comment line in each Cartesian coordinate block should be returned.
        Returned as a 1D array of strings.

    Returns
    -------
    :math:`m*n*3` |np.ndarray|_ [|np.float64|_], |dict|_ [|str|_, |list|_ [|int|_]] and\
    (optional) :math:`m` |np.ndarray|_ [|str|_]:
        * A 3D array with Cartesian coordinates of :math:`m` molecules with :math:`n` atoms.
        * A dictionary with atomic symbols as keys and lists of matching atomic indices as values.
        * (Optional) a 1D array with :math:`m` comments.

    Raises
    ------
    :exc:`.XYZError`
        Raised when issues are encountered related to parsing .xyz files.

    """
    # Define constants and construct a dictionary: {atomic symbols: [atomic indices]}
    with open(filename, 'r') as f:
        atom_count = _get_atom_count(f)
        idx_dict = _get_idx_dict(f, atom_count)
        try:
            line_count = _get_line_count(f, add=[2, atom_count])
        except UnboundLocalError:  # The .xyz file contains a single molecule
            line_count = 2 + atom_count

    # Check if mol_count is fractional, smaller than 1 or if atom_count is smaller than 1
    mol_count = line_count / (2 + atom_count)
    validate_xyz(mol_count, atom_count, filename)

    # Create the to-be returned xyz array
    shape = int(mol_count), atom_count, 3
    with open(filename, 'r') as f:
        iterator = chain.from_iterable(_xyz_generator(f, atom_count))
        try:
            xyz = np.fromiter(iterator, dtype=float, count=np.product(shape))
        except ValueError as ex:  # Failed to parse the .xyz file
            raise XYZError(str(ex)).with_traceback(ex.__traceback__)
    xyz.shape = shape  # From 1D to 3D array

    if return_comment:
        return xyz, idx_dict, get_comments(filename, atom_count)
    else:
        return xyz, idx_dict


def _xyz_generator(f: Iterable[str], atom_count: int) -> Generator[Iterator[int], None, None]:
    """Create a Cartesian coordinate generator for :func:`.read_multi_xyz`."""
    stop = 1 + atom_count
    for _ in f:
        yield chain.from_iterable(at.split()[1:] for at in islice(f, 1, stop))


def get_comments(filename: Union[AnyStr, os.PathLike], atom_count: int) -> np.ndarray:
    """Read and returns all comment lines in an xyz file.

    A single comment line should be located under the atom count of each molecule.

    Parameters
    ----------
    filename : str
        The path+filename of a (multi) .xyz file.

    atom_count : int
        The number of atoms per molecule.

    Returns
    -------
    :math:`m` |np.ndarray|_ [|str|_]:
        A 1D array with :math:`m` comments extracted from **filename**.

    """
    step = 2 + atom_count
    with open(filename, 'r') as f:
        iterator = islice(f, 1, None, step)  # Generator slicing
        return np.array([i.rstrip() for i in iterator])


def validate_xyz(mol_count: float, atom_count: int, filename: str) -> None:
    """Validate **mol_count** and **atom_count** in **xyz_file**.

    Parameters
    ----------
    mol_count : float
        The number of molecules in the xyz file.
        Expects float that is finite with integral value
        (*e.g.* :math:`5.0`, :math:`6.0` or :math:`3.0`).

    atom_count : int
        The number of atoms per molecule.

    filename : str
        The path + filename of a (multi) .xyz file.

    Raises
    ------
    :exc:`.XYZError`
        Raised when issues are encountered related to parsing .xyz files.

    """
    filename = f"'...{os.sep}{os.path.basename(filename)}'"
    if not mol_count.is_integer():
        raise XYZError(f"A non-integer number of molecules was found in '{filename}'; "
                       f"mol count: {mol_count}")
    elif mol_count < 1.0:
        raise XYZError(f"No molecules were found in '{filename}'; mol count: {mol_count}")
    if atom_count < 1:
        raise XYZError(f"No atoms were found in '{filename}'; atom count: {atom_count}")


def _get_atom_count(f: Iterator[str]) -> int:
    """Extract the number of atoms per molecule from the first line in an .xyz file.

    Parameters
    ----------
    f : |io.TextIOWrapper|_
        An opened .xyz file.

    Returns
    -------
    |int|_:
        The number of atoms per molecule.

    Raises
    ------
    :exc:`.XYZError`
        Raised when issues are encountered related to parsing .xyz files.

    """
    ret = next(f)
    try:
        return int(ret)
    except ValueError as ex:
        err = (f"{reprlib.repr(ret)} is not a valid integer, the first line in an .xyz file "
               "should contain the number of atoms per molecule")
        raise XYZError(err).with_traceback(ex.__traceback__)


def _get_line_count(f: Iterable, add: Union[int, Iterable[int]] = 0) -> int:
    """Extract the total number lines from **f**.

    Parameters
    ----------
    f : |io.TextIOWrapper|_
        An opened .xyz file.

    add : int or |Iterable|_ [|int|_]
        Add a constant to the to-be returned line count.

    Returns
    -------
    |int|_:
        The total number of lines in **f**.

    """
    start = 1 + sum(add)
    for i, _ in enumerate(f, start):
        pass
    return i


def _get_idx_dict(f: Iterable[str], atom_count: int) -> Dict[str, List[int]]:
    """Extract atomic symbols and matching atomic indices from **f**.

    Parameters
    ----------
    f : |io.TextIOWrapper|_
        An opened .xyz file.

    atom_count : int
        The number of atoms per molecule.

    Returns
    -------
    |dict|_ [|str|_, |list|_ [|int|_]]:
        A dictionary with atomic symbols and a list of matching atomic indices.

    """
    stop = 1 + atom_count
    atom_list = [at.split(maxsplit=1)[0].capitalize() for at in islice(f, 1, stop)]
    return group_by_values(enumerate(atom_list))