"""
FOX.io.read_xyz
===============
A module for reading multi-xyz files.
Index
-----
.. currentmodule:: FOX.io.read_xyz
.. autosummary::
XYZError
read_multi_xyz
get_comments
validate_xyz
_get_atom_count
_get_line_count
_get_idx_dict
API
---
.. autoexception:: FOX.io.read_xyz.XYZError
.. autofunction:: FOX.io.read_xyz.read_multi_xyz
.. autofunction:: FOX.io.read_xyz.get_comments
.. autofunction:: FOX.io.read_xyz.validate_xyz
.. autofunction:: FOX.io.read_xyz._get_atom_count
.. autofunction:: FOX.io.read_xyz._get_line_count
.. autofunction:: FOX.io.read_xyz._get_idx_dict
"""
import os
import reprlib
from typing import Tuple, Dict, Iterable, List, Union, Iterator, Generator, AnyStr
from itertools import islice, chain
import numpy as np
from ..functions.utils import group_by_values
__all__ = ['read_multi_xyz']
class XYZError(OSError):
"""Raise when there are issues related to parsing .xyz files."""
XYZoutput = Union[Tuple[np.ndarray, Dict[str, List[int]]],
Tuple[np.ndarray, Dict[str, List[int]], np.ndarray]]
[docs]def read_multi_xyz(filename: Union[AnyStr, os.PathLike], return_comment: bool = True) -> XYZoutput:
r"""Read a (multi) .xyz file.
Parameters
----------
filename : str
The path+filename of a (multi) .xyz file.
return_comment : bool
Whether or not the comment line in each Cartesian coordinate block should be returned.
Returned as a 1D array of strings.
Returns
-------
:math:`m*n*3` |np.ndarray|_ [|np.float64|_], |dict|_ [|str|_, |list|_ [|int|_]] and\
(optional) :math:`m` |np.ndarray|_ [|str|_]:
* A 3D array with Cartesian coordinates of :math:`m` molecules with :math:`n` atoms.
* A dictionary with atomic symbols as keys and lists of matching atomic indices as values.
* (Optional) a 1D array with :math:`m` comments.
Raises
------
:exc:`.XYZError`
Raised when issues are encountered related to parsing .xyz files.
"""
# Define constants and construct a dictionary: {atomic symbols: [atomic indices]}
with open(filename, 'r') as f:
atom_count = _get_atom_count(f)
idx_dict = _get_idx_dict(f, atom_count)
try:
line_count = _get_line_count(f, add=[2, atom_count])
except UnboundLocalError: # The .xyz file contains a single molecule
line_count = 2 + atom_count
# Check if mol_count is fractional, smaller than 1 or if atom_count is smaller than 1
mol_count = line_count / (2 + atom_count)
validate_xyz(mol_count, atom_count, filename)
# Create the to-be returned xyz array
shape = int(mol_count), atom_count, 3
with open(filename, 'r') as f:
iterator = chain.from_iterable(_xyz_generator(f, atom_count))
try:
xyz = np.fromiter(iterator, dtype=float, count=np.product(shape))
except ValueError as ex: # Failed to parse the .xyz file
raise XYZError(str(ex)).with_traceback(ex.__traceback__)
xyz.shape = shape # From 1D to 3D array
if return_comment:
return xyz, idx_dict, get_comments(filename, atom_count)
else:
return xyz, idx_dict
def _xyz_generator(f: Iterable[str], atom_count: int) -> Generator[Iterator[int], None, None]:
"""Create a Cartesian coordinate generator for :func:`.read_multi_xyz`."""
stop = 1 + atom_count
for _ in f:
yield chain.from_iterable(at.split()[1:] for at in islice(f, 1, stop))
def get_comments(filename: Union[AnyStr, os.PathLike], atom_count: int) -> np.ndarray:
"""Read and returns all comment lines in an xyz file.
A single comment line should be located under the atom count of each molecule.
Parameters
----------
filename : str
The path+filename of a (multi) .xyz file.
atom_count : int
The number of atoms per molecule.
Returns
-------
:math:`m` |np.ndarray|_ [|str|_]:
A 1D array with :math:`m` comments extracted from **filename**.
"""
step = 2 + atom_count
with open(filename, 'r') as f:
iterator = islice(f, 1, None, step) # Generator slicing
return np.array([i.rstrip() for i in iterator])
def validate_xyz(mol_count: float, atom_count: int, filename: str) -> None:
"""Validate **mol_count** and **atom_count** in **xyz_file**.
Parameters
----------
mol_count : float
The number of molecules in the xyz file.
Expects float that is finite with integral value
(*e.g.* :math:`5.0`, :math:`6.0` or :math:`3.0`).
atom_count : int
The number of atoms per molecule.
filename : str
The path + filename of a (multi) .xyz file.
Raises
------
:exc:`.XYZError`
Raised when issues are encountered related to parsing .xyz files.
"""
filename = f"'...{os.sep}{os.path.basename(filename)}'"
if not mol_count.is_integer():
raise XYZError(f"A non-integer number of molecules was found in '{filename}'; "
f"mol count: {mol_count}")
elif mol_count < 1.0:
raise XYZError(f"No molecules were found in '{filename}'; mol count: {mol_count}")
if atom_count < 1:
raise XYZError(f"No atoms were found in '{filename}'; atom count: {atom_count}")
def _get_atom_count(f: Iterator[str]) -> int:
"""Extract the number of atoms per molecule from the first line in an .xyz file.
Parameters
----------
f : |io.TextIOWrapper|_
An opened .xyz file.
Returns
-------
|int|_:
The number of atoms per molecule.
Raises
------
:exc:`.XYZError`
Raised when issues are encountered related to parsing .xyz files.
"""
ret = next(f)
try:
return int(ret)
except ValueError as ex:
err = (f"{reprlib.repr(ret)} is not a valid integer, the first line in an .xyz file "
"should contain the number of atoms per molecule")
raise XYZError(err).with_traceback(ex.__traceback__)
def _get_line_count(f: Iterable, add: Union[int, Iterable[int]] = 0) -> int:
"""Extract the total number lines from **f**.
Parameters
----------
f : |io.TextIOWrapper|_
An opened .xyz file.
add : int or |Iterable|_ [|int|_]
Add a constant to the to-be returned line count.
Returns
-------
|int|_:
The total number of lines in **f**.
"""
start = 1 + sum(add)
for i, _ in enumerate(f, start):
pass
return i
def _get_idx_dict(f: Iterable[str], atom_count: int) -> Dict[str, List[int]]:
"""Extract atomic symbols and matching atomic indices from **f**.
Parameters
----------
f : |io.TextIOWrapper|_
An opened .xyz file.
atom_count : int
The number of atoms per molecule.
Returns
-------
|dict|_ [|str|_, |list|_ [|int|_]]:
A dictionary with atomic symbols and a list of matching atomic indices.
"""
stop = 1 + atom_count
atom_list = [at.split(maxsplit=1)[0].capitalize() for at in islice(f, 1, stop)]
return group_by_values(enumerate(atom_list))