Source code for FOX.recipes.psf

"""
FOX.recipes.psf
=================

A set of functions for creating .psf files.

Examples
--------
Example code for generating a .psf file.
Ligand atoms within the ligand .xyz file and the qd .xyz file should be in the *exact* same order.
For example, implicit hydrogen atoms added by the
:func:`from_smiles<scm.plams.interfaces.molecule.rdkit.from_smiles>` functions are not guaranteed
to be ordered, even when using canonical SMILES strings.

.. code:: python

    >>> from scm.plams import Molecule, from_smiles
    >>> from FOX import PSFContainer
    >>> from FOX.recipes import generate_psf

    # Accepts .xyz, .pdb, .mol or .mol2 files
    >>> qd = Molecule(...)
    >>> ligand: Molecule = Molecule(...)
    >>> rtf_file : str = ...
    >>> psf_file : str = ...

    >>> psf: PSFContainer = generate_psf(qd_xyz, ligand_xyz, rtf_file=rtf_file)
    >>> psf.write(psf_file)


Examples
--------
If no ligand .xyz is on hand, or its atoms are in the wrong order, it is possible the
extract the ligand directly from the quantum dot.
This is demonstrated below with oleate (:math:`C_{18} H_{33} O_{2}^{-}`).

.. code:: python

    >>> from scm.plams import Molecule
    >>> from FOX import PSFContainer
    >>> from FOX.recipes import generate_psf, extract_ligand

    >>> qd = Molecule(...)  # Accepts an .xyz, .pdb, .mol or .mol2 file
    >>> rtf_file : str = ...

    >>> ligand_len = 18 + 33 + 2
    >>> ligand_atoms = {'C', 'H', 'O'}
    >>> ligand: Molecule = extract_ligand(qd, ligand_len, ligand_atoms)

    >>> psf: PSFContainer = generate_psf(qd, ligand, rtf_file=rtf_file)
    >>> psf.write(...)


Examples
--------
Example for multiple ligands.

.. code:: python

    >>> from typing import List
    >>> from scm.plams import Molecule
    >>> from FOX import PSFContainer
    >>> from FOX.recipes import generate_psf2

    >>> qd = Molecule(...)  # Accepts an .xyz, .pdb, .mol or .mol2 file
    >>> ligands = ('C[O-]', 'CC[O-]', 'CCC[O-]')
    >>> rtf_files = (..., ..., ...)

    >>> psf: PSFContainer = generate_psf2(qd, *ligands, rtf_file=rtf_files)
    >>> psf.write(...)

If the the psf construction with :func:`generate_psf2` failes to identify a particular ligand,
it is possible to return all (failed) potential ligands with the **ret_failed_lig** parameter.

.. code:: python

    >>> ...

    >>> ligands = ('CCCCCCCCC[O-]', 'CCCCBr')
    >>> failed_mol_list: List[Molecule] = generate_psf2(qd, *ligands, ret_failed_lig=True)


Index
-----
.. currentmodule:: FOX.recipes.psf
.. autosummary::
    generate_psf
    generate_psf2
    extract_ligand

API
---
.. autofunction:: generate_psf
.. autofunction:: generate_psf2
.. autofunction:: extract_ligand

"""
import math
import warnings
from os import PathLike
from sys import version_info
from types import MappingProxyType
from typing import (Union, Iterable, Optional, TypeVar, Callable, Mapping, Type, Iterator,
                    Hashable, Any, Tuple, MutableMapping, AnyStr, List, SupportsFloat)
from itertools import chain
from collections import abc

import numpy as np
from scm.plams import Molecule, Atom, Bond, MoleculeError, PT

from FOX import PSFContainer, assert_error
from FOX.io.read_psf import overlay_rtf_file, overlay_str_file
from FOX.functions.utils import group_by_values
from FOX.functions.molecule_utils import fix_bond_orders
from FOX.armc_functions.sanitization import _assign_residues

if version_info.minor < 7:
    from collections import OrderedDict
else:  # Dictionaries are ordered starting from python 3.7
    OrderedDict = dict

try:
    from rdkit import Chem
    from scm.plams import from_smiles, to_rdmol

    Mol: Union[str, type] = Chem.Mol
    RDKIT_ERROR = None

    # A somewhat contrived way of loading :exc:`ArgumentError<Boost.Python.ArgumentError>`
    _MOL = Molecule()
    _MOL.atoms = [Atom(symbol='H', coords=[0, 0, 0], mol=_MOL)]
    _MOL[1].properties.charge = -0.5
    try:
        to_rdmol(_MOL)
    except Exception as ex:
        ArgumentError: Optional[Type[Exception]] = type(ex)
    del _MOL

except ImportError:
    Mol: Union[str, type] = 'rdkit.Chem.rdchem.Mol'
    ArgumentError: Optional[Type[Exception]] = None
    RDKIT_ERROR = ("Use of the FOX.{} function requires the 'rdkit' package."
                   "\n'rdkit' can be installed via conda with the following command:"
                   "\n\tconda install -n FOX -c conda-forge rdkit")


__all__ = ['generate_psf', 'generate_psf2', 'extract_ligand']


[docs]def generate_psf(qd: Union[str, Molecule], ligand: Union[str, Molecule], rtf_file: Optional[str] = None, str_file: Optional[str] = None) -> PSFContainer: """Generate a :class:`PSFContainer` instance for **qd**. Parameters ---------- qd : :class:`str` or :class:`Molecule` The ligand-pacifated quantum dot. Should be supplied as either a Molecule or .xyz file. ligand : :class:`str` or :class:`Molecule` A single ligand. Should be supplied as either a Molecule or .xyz file. rtf_file : :class:`str`, optional The path+filename of the ligand's .rtf file. Used for assigning atom types. Alternativelly, one can supply a .str file with the **str_file** argument. str_file : :class:`str`, optional The path+filename of the ligand's .str file. Used for assigning atom types. Alternativelly, one can supply a .rtf file with the **rtf_file** argument. Returns ------- :class:`PSFContainer` A PSFContainer instance with the new .psf file. """ if not isinstance(qd, Molecule): qd = Molecule(qd) if not isinstance(ligand, Molecule): ligand = Molecule(ligand) # Find the start of the ligand atnum = ligand[1].atnum for ligand_start, at in enumerate(qd): if at.atnum == atnum: break else: raise MoleculeError(f'No atom {ligand[1].symbol} found within {qd.get_formula()}') # Create an array with atomic-indice pairs defining bonds ligand.set_atoms_id() bonds = np.array([(b.atom1.id, b.atom2.id) for b in ligand.bonds]) bonds += ligand_start ligand.unset_atoms_id() # Manually add bonds to the quantum dot ligand_len = len(ligand) qd.delete_all_bonds() while True: try: qd[bonds[0, 0]] except IndexError: break else: for j, k in bonds: at1, at2 = qd[j], qd[k] qd.add_bond(at1, at2) bonds += ligand_len # Create a nested list with residue indices res_ar = np.arange(ligand_start, len(qd)) res_ar.shape = -1, ligand_len res_list = res_ar.tolist() res_list.insert(0, np.arange(ligand_start)) _assign_residues(qd, res_list) # Create the .psf file psf = PSFContainer() psf.generate_bonds(qd) psf.generate_angles(qd) psf.generate_dihedrals(qd) psf.generate_impropers(qd) psf.generate_atoms(qd) overlay_rtf_file(psf, rtf_file) if rtf_file is not None else None overlay_str_file(psf, str_file) if str_file is not None else None # Set the charge to zero and return psf.charge = 0.0 return psf
[docs]def extract_ligand(qd: Union[str, Molecule], ligand_len: int, ligand_atoms: Union[str, Iterable[str]]) -> Molecule: """Extract a single ligand from **qd**. Parameters ---------- qd : :class:`str` or :class:`Molecule` The ligand-pacifated quantum dot. Should be supplied as either a Molecule or .xyz file. ligand_len : :class:`int` The number of atoms within a single ligand. ligand_atoms : :class:`str` or :class:`Iterable<collections.abc.Iterable>` [:class:`str`] One or multiple strings with the atomic symbols of all atoms within a single ligand. Returns ------- :class:`Molecule` A single ligand Molecule. """ if not isinstance(qd, Molecule): qd = Molecule(qd) if not isinstance(ligand_atoms, set): ligand_atoms = set(ligand_atoms) if not isinstance(ligand_atoms, str) else {ligand_atoms} # Identify where the core ends and the ligands start for i, at in enumerate(qd): if at.symbol in ligand_atoms: break else: raise MoleculeError(f'No atoms {tuple(ligand_atoms)} found within {qd.get_formula()}') # Construct a ligand j = i + ligand_len ligand = Molecule() ligand.atoms = [Atom(atnum=at.atnum, coords=at.coords, mol=ligand) for at in qd.atoms[i:j]] ligand.guess_bonds() return ligand
[docs]@assert_error(RDKIT_ERROR) def generate_psf2(qd: Union[str, Molecule], *ligands: Union[str, Molecule, Mol], rtf_file: Union[None, str, Iterable[str]] = None, str_file: Union[None, str, Iterable[str]] = None, ret_failed_lig: bool = False) -> PSFContainer: r"""Generate a :class:`PSFContainer` instance for **qd** with multiple different **ligands**. Parameters ---------- qd : :class:`str` or :class:`Molecule` The ligand-pacifated quantum dot. Should be supplied as either a Molecule or .xyz file. \*ligands : :class:`str`, :class:`Molecule` or :class:`Chem.Mol` One or more PLAMS/RDkit Molecules and/or SMILES strings representing ligands. rtf_file : :class:`str` or :class:`Iterable<collections.abc.Iterable>` [:class:`str`], optional The path+filename of the ligand's .rtf files. Filenames should be supplied in the same order as **ligands**. Used for assigning atom types. Alternativelly, one can supply a .str file with the **str_file** argument. str_file : :class:`str` or :class:`Iterable<collections.abc.Iterable>` [:class:`str`], optional The path+filename of the ligand's .str files. Filenames should be supplied in the same order as **ligands**. Used for assigning atom types. Alternativelly, one can supply a .rtf file with the **rtf_file** argument. ret_failed_lig : :class:`bool` If ``True``, return a list of all failed (potential) ligands if the function cannot identify any ligands within a certain range. Usefull for debugging. If ``False``, raise a :exc:`MoleculeError`. Returns ------- :class:`Molecule` A single ligand Molecule. Raises ------ :exc:`MoleculeError` Raised if the function fails to identify any ligands within a certain range. If ``ret_failed_lig = True``, return a list of failed (potential) ligands instead and issue a warning. """ if not isinstance(qd, Molecule): qd = Molecule(qd) # Create a dictionary with RDKit molecules and the number of atoms contained therein rdmol_dict = _get_rddict(ligands) # Find the starting atom ligand_atoms = {at.GetAtomicNum() for rdmol in rdmol_dict for at in rdmol.GetAtoms()} for i, at in enumerate(qd): if at.atnum in ligand_atoms: break else: raise MoleculeError(f'No atoms {tuple(PT[i] for i in ligand_atoms)} found ' f'within {qd.get_formula()}') # Identify all bonds and residues res_list = [np.arange(i)] res_dict = {} while True: new, j = _get_initial_lig(qd, rdmol_dict, i) if new is None: break ref0, _ = next(iter(rdmol_dict.items())) for ref, k in rdmol_dict.items(): k = 0 if ref is ref0 else k new = _update_lig(new, k, copy=False) j += k if _get_matches(new, ref): qd.bonds += [Bond(atom1=qd[bond.atom1.id], atom2=qd[bond.atom2.id], order=bond.order, mol=qd) for bond in new.bonds] res_list.append(np.arange(i, i+j)) res_dict[len(res_list)] = id(ref) break else: continue else: err = (f'Failed to identify any ligands {ligands} within the range ' f'[{i}:{i + next(iter(rdmol_dict.items()))[1]}]') if not ret_failed_lig: raise MoleculeError(err) else: warnings.warn(err, category=MoleculeWarning) return _return_failed_ligs(qd, rdmol_dict, i) i += j # Create the .psf file _assign_residues(qd, res_list) psf = PSFContainer() psf.generate_bonds(qd) psf.generate_angles(qd) psf.generate_dihedrals(qd) psf.generate_impropers(qd) psf.generate_atoms(qd, res_dict) if not (rtf_file is str_file is None): _id_dict = group_by_values(res_dict.items()) id_range = (_id_dict[id(k)] for k in rdmol_dict.keys()) _overlay(psf, 'rtf', id_range, rtf_file) if rtf_file is not None else None _overlay(psf, 'str', id_range, str_file) if str_file is not None else None # Set the charge to zero and return psf.charge = 0.0 return psf
def _get_initial_lig(qd: Molecule, rdmol_dict: Mapping[Mol, int], i: int ) -> Tuple[Union[None, Molecule], int]: """Construct a new ligand at the begining of the :func:`generate_psf2` ``while`` loop.""" _, j = next(iter(rdmol_dict.items())) new = Molecule() new.atoms = [Atom(atnum=at.atnum, coords=at.coords, mol=new) for at in qd.atoms[i:i+j]] if not new: return None, j elif len(new) != j: # Pad with dummy atoms new.atoms += [Atom(atnum=0, coords=[0, 0, 0], mol=new) for _ in range(j - len(new))] new.set_atoms_id(start=i+1) return new, j def _update_lig(ligand: Molecule, k: int, copy: bool = False) -> Molecule: """Update a ligand by removing the last **k** atoms.""" ligand = ligand.copy() if copy else ligand atoms_del = ligand.atoms[k:] if k != 0 else [] for at in atoms_del: ligand.delete_atom(at) ligand.guess_bonds() set_integer_bonds(ligand) fix_bond_orders(ligand) return ligand def _return_failed_ligs(qd: Molecule, rdmol_dict: Mapping[Mol, int], i: int) -> List[Molecule]: """Return a list of failed ligands in case :func:`generate_psf2` fails to identify ligands.""" new, j = _get_initial_lig(qd, rdmol_dict, i) ret = [] ref0, _ = next(iter(rdmol_dict.items())) for ref, k in rdmol_dict.items(): k = 0 if ref is ref0 else k new = _update_lig(new, k, copy=True) ret.append(new) j += k return ret class MoleculeWarning(RuntimeWarning): # Molecule related warnings pass MolType = TypeVar('MolType', Molecule, str, Mol) try: #: Map a :class:`type` object to a callable for creating :class:`rdkit.Chem.Mol` instances. MOL_MAPPING: Mapping[Type[MolType], Callable[[MolType], Mol]] = MappingProxyType({ str: lambda mol: to_rdmol(from_smiles(mol)), Molecule: to_rdmol, Mol: lambda mol: mol }) except NameError: MOL_MAPPING = None # rdkit is not installed def _overlay(psf: PSFContainer, mode: str, id_ranges: Iterable[Iterable[int]], files: Union[AnyStr, PathLike, Iterable[AnyStr], Iterable[PathLike]]) -> None: """Overlay one or more .str or .rtf files.""" if not isinstance(files, abc.Iterable) or isinstance(files, (str, bytes)): files_iter = (files,) else: files_iter = files if mode == 'rtf': func = overlay_rtf_file elif mode == 'str': func = overlay_str_file else: raise ValueError(f"'mode' expected either 'rtf' or 'str'; supplied value: {repr(mode)}") for file, id_range in zip(files_iter, id_ranges): func(psf, file, id_range=id_range) def _items_sorted(dct: Mapping) -> Iterator[Tuple[Hashable, Any]]: """Return a :meth:`dict.items()` iterator whose items are sorted by the dictionary values.""" return iter(sorted(dct.items(), key=lambda kv: kv[1], reverse=True)) @assert_error(RDKIT_ERROR) def _get_matches(mol: Molecule, ref: Mol) -> bool: """Check if the structures of **mol** and **ref** match.""" try: rdmol = to_rdmol(mol) except ArgumentError: return False matches = rdmol.GetSubstructMatches(ref) match_set = set(chain.from_iterable(matches)) return match_set == set(range(len(mol))) and len(match_set) == len(mol) @assert_error(RDKIT_ERROR) def _get_rddict(ligands: Iterable[Union[str, Molecule, Mol]]) -> MutableMapping[Mol, int]: """Create an ordered dict with rdkit molecules and delta atom counts for :func:`generate_psf`.""" # noqa tmp_dct = {MOL_MAPPING[type(lig)](lig): 0 for lig in ligands} for rdmol in tmp_dct: tmp_dct[rdmol] = len(rdmol.GetAtoms()) v_old = 0 rdmol_dict = OrderedDict() for k, v in _items_sorted(tmp_dct): rdmol_dict[k] = v - v_old v_old = v return rdmol_dict def set_integer_bonds(self): """Convert non-integer bond orders into integers. For example, bond orders of aromatic systems are no longer set to the non-integer value of ``1.5``, instead adopting bond orders of ``1`` and ``2``. The implemented function walks a set of graphs constructed from all non-integer bonds, converting the orders of aforementioned bonds to integers by alternating calls to :func:`math.ceil` and :func:`math.floor`. The implication herein is that both :math:`i` and :math:`i+1` are considered valid (integer) values for any bond order within the :math:`(i, i+1)` interval. Floats which can be represented exactly as an integer, *e.g.* :math:`1.0`, are herein treated as integers. Can be used for sanitizaing any Molecules passed to the :mod:`rdkit<scm.plams.interfaces.molecule.rdkit>` module, as its functions are generally unable to handle Molecules with non-integer bond orders. ..code:: python >>> from scm.plams import Molecule >>> benzene = Molecule(...) >>> print(benzene) Atoms: 1 C 1.193860 -0.689276 0.000000 2 C 1.193860 0.689276 0.000000 3 C 0.000000 1.378551 0.000000 4 C -1.193860 0.689276 0.000000 5 C -1.193860 -0.689276 0.000000 6 C -0.000000 -1.378551 0.000000 7 H 2.132911 -1.231437 -0.000000 8 H 2.132911 1.231437 -0.000000 9 H 0.000000 2.462874 -0.000000 10 H -2.132911 1.231437 -0.000000 11 H -2.132911 -1.231437 -0.000000 12 H -0.000000 -2.462874 -0.000000 Bonds: (3)--1.5--(4) (5)--1.5--(6) (1)--1.5--(6) (2)--1.5--(3) (4)--1.5--(5) (1)--1.5--(2) (3)--1.0--(9) (6)--1.0--(12) (5)--1.0--(11) (4)--1.0--(10) (2)--1.0--(8) (1)--1.0--(7) >>> benzene.set_integer_bonds() >>> print(benzene) Atoms: 1 C 1.193860 -0.689276 0.000000 2 C 1.193860 0.689276 0.000000 3 C 0.000000 1.378551 0.000000 4 C -1.193860 0.689276 0.000000 5 C -1.193860 -0.689276 0.000000 6 C -0.000000 -1.378551 0.000000 7 H 2.132911 -1.231437 -0.000000 8 H 2.132911 1.231437 -0.000000 9 H 0.000000 2.462874 -0.000000 10 H -2.132911 1.231437 -0.000000 11 H -2.132911 -1.231437 -0.000000 12 H -0.000000 -2.462874 -0.000000 Bonds: (3)--1.0--(4) (5)--1.0--(6) (1)--2.0--(6) (2)--2.0--(3) (4)--2.0--(5) (1)--1.0--(2) (3)--1.0--(9) (6)--1.0--(12) (5)--1.0--(11) (4)--1.0--(10) (2)--1.0--(8) (1)--1.0--(7) """ ceil = math.ceil floor = math.floor func_invert = {ceil: floor, floor: ceil} def dfs(atom, func) -> None: """Depth-first search algorithm for integer-ifying the bond orders.""" for b2 in atom.bonds: if b2._visited: continue b2._visited = True b2.order = func(b2.order) # func = ``math.ceil()`` or ``math.floor()`` del bond_dict[b2] atom_new = b2.atom1 if b2.atom1 is not atom else b2.atom2 dfs(atom_new, func=func_invert[func]) # Mark all non-integer bonds; floats which can be represented exactly # by an integer (e.g. 1.0 and 2.0) are herein treated as integers bond_dict = OrderedDict() # An improvised OrderedSet (as it does not exist) for bond in self.bonds: if hasattr(bond.order, 'is_integer') and not bond.order.is_integer(): bond._visited = False bond_dict[bond] = None else: bond._visited = True while bond_dict: b1, _ = bond_dict.popitem() order = b1.order # Start with either ``math.ceil()`` if the ceiling is closer than the floor; # start with ``math.floor()`` otherwise delta_ceil, delta_floor = ceil(order) - order, floor(order) - order func = ceil if abs(delta_ceil) < abs(delta_floor) else floor b1.order = func(order) b1._visited = True dfs(b1.atom1, func=func_invert[func]) dfs(b1.atom2, func=func_invert[func]) for bond in self.bonds: del bond._visited