Source code for matador.scrapers.cif_scraper

# coding: utf-8
# Distributed under the terms of the MIT License.

""" This file implements the scraper functions for the CIF
(Crystallographic Information File) format.

"""

import functools
from pathlib import Path
import numpy as np
from matador.scrapers.utils import scraper_function, get_flines_extension_agnostic
from matador.utils.cell_utils import (
    get_spacegroup_spg,
    abc2cart,
    cart2volume,
    frac2cart,
)

EPS = 1e-13


[docs]@scraper_function
def cif2dict(fname, **kwargs):
    """Extract available information from  .cif file and store as a
    dictionary. Raw cif data is stored under the `'_cif'` key. Symmetric
    sites are expanded by the symmetry operations and their occupancies
    are tracked.

    Parameters:
        fname (str/list): filename or list of filenames of .cif file(s)
            (with or without extension).

    Returns:
        (dict/str, bool): if successful, a dictionary containing scraped
            data and True, if not, then an error string and False.


    """

    flines, fname = get_flines_extension_agnostic(fname, "cif")

    doc = dict()
    cif_dict = _cif_parse_raw(flines)

    doc["_cif"] = cif_dict
    doc["source"] = [str(Path(fname).resolve())]

    doc["atom_types"] = []
    atom_labels = cif_dict.get("_atom_site_type_symbol", False)
    if not atom_labels:
        atom_labels = cif_dict.get("_atom_site_label", False)
    if not atom_labels:
        raise RuntimeError(f"Unable to find atom types in cif file {fname}.")

    for atom in atom_labels:
        symbol = ""
        for character in atom:
            if not character.isalpha():
                break
            else:
                symbol += character
        doc["atom_types"].append(symbol)

    doc["positions_frac"] = [
        list(map(lambda x: float(x.split("(")[0]), vector))
        for vector in zip(
            cif_dict["_atom_site_fract_x"],
            cif_dict["_atom_site_fract_y"],
            cif_dict["_atom_site_fract_z"],
        )
    ]

    if "_atom_site_occupancy" in cif_dict:
        doc["site_occupancy"] = [
            float(x.split("(")[0]) for x in cif_dict["_atom_site_occupancy"]
        ]
    else:
        doc["site_occupancy"] = [1.0 for _ in doc["positions_frac"]]

    if "_atom_site_symmetry_multiplicity" in cif_dict:
        doc["site_multiplicity"] = [
            float(x.split("(")[0]) for x in cif_dict["_atom_site_symmetry_multiplicity"]
        ]
    else:
        doc["site_multiplicity"] = [1.0 for _ in doc["positions_frac"]]

    doc["lattice_abc"] = [
        list(
            map(
                _cif_parse_float_with_errors,
                [
                    cif_dict["_cell_length_a"],
                    cif_dict["_cell_length_b"],
                    cif_dict["_cell_length_c"],
                ],
            )
        ),
        list(
            map(
                _cif_parse_float_with_errors,
                [
                    cif_dict["_cell_angle_alpha"],
                    cif_dict["_cell_angle_beta"],
                    cif_dict["_cell_angle_gamma"],
                ],
            )
        ),
    ]

    doc["lattice_cart"] = abc2cart(doc["lattice_abc"])
    doc["cell_volume"] = cart2volume(doc["lattice_cart"])
    doc["stoichiometry"] = _cif_disordered_stoichiometry(doc)
    doc["num_atoms"] = len(doc["positions_frac"])

    if (
        "_space_group_symop_operation_xyz" in doc["_cif"]
        and "_symmetry_equiv_pos_as_xyz" not in doc["_cif"]
    ):
        doc["_cif"]["_symmetry_equiv_pos_as_xyz"] = doc["_cif"][
            "_space_group_symop_operation_xyz"
        ]
    if "_symmetry_equiv_pos_as_xyz" in doc["_cif"]:
        _cif_set_unreduced_sites(doc)

    try:
        doc["space_group"] = get_spacegroup_spg(doc, check_occ=False)
    except RuntimeError:
        pass

    return doc, True


def _cif_parse_float_with_errors(x):
    """Strip bracketed errors from end of float."""
    return float(x.split("(")[0])


def _cif_disordered_stoichiometry(doc):
    """Create a matador stoichiometry normalised to the smallest integer
    number of atoms, unless all occupancies are 1/0.

    Parameters:
        doc: dictionary containing `atom_types`, `site_occupancy` and
            `site_multiplicity` keys.

    Returns:
        list of tuples: a standard matador stoichiometry.

    """
    from collections import defaultdict

    stoich = defaultdict(float)
    eps = 1e-8
    disordered = False
    for ind, site in enumerate(doc["atom_types"]):
        stoich[site] += doc["site_occupancy"][ind] * doc["site_multiplicity"][ind]
        if doc["site_multiplicity"][ind] % 1 > 1e-5:
            disordered = True

    if disordered:
        min_int = 1e10
        for atom in stoich:
            if abs(int(stoich[atom]) - stoich[atom]) < eps:
                if int(stoich[atom]) < min_int:
                    min_int = int(stoich[atom])

        if min_int == 1e10:
            min_int = 1
        for atom in stoich:
            stoich[atom] /= min_int

    return sorted([[atom, stoich[atom]] for atom in stoich])


def _cif_parse_raw(flines):
    """Parse raw CIF file data into a dictionary.

    Parameters:
        flines (:obj:`list` of :obj:`str`): contents of .cif file.

    Returns:
        dict: dictionary containing cif data with native fields/ordering.

    """
    ind = 0
    cif_dict = dict()
    cif_dict["loops"] = list()
    while ind < len(flines):
        jnd = 1
        line = flines[ind].strip()
        # parse single (multi-line) tag
        if line.startswith("_"):
            line = line.split()
            key = line[0]
            data = ""
            if len(line) > 1:
                data += " ".join(line[1:])
            while ind + jnd < len(flines) and _cif_line_contains_data(
                flines[ind + jnd].strip()
            ):
                data += flines[ind + jnd].strip().replace(";", "")
                jnd += 1
            cif_dict[key] = data.strip()
        # parse loop block
        elif line.startswith("loop_"):
            # get loop keys
            keys = []
            while flines[ind + jnd].strip().startswith("_"):
                keys.append(flines[ind + jnd].strip())
                jnd += 1
            for key in keys:
                cif_dict[key] = []
            cif_dict["loops"].append(keys)
            while ind + jnd < len(flines) and _cif_line_contains_data(
                flines[ind + jnd].strip()
            ):
                data = ""
                # loop over line and next lines
                while ind + jnd < len(flines) and _cif_line_contains_data(
                    flines[ind + jnd]
                ):
                    data += flines[ind + jnd]
                    jnd += 1

            loop_dict = _cif_parse_loop(keys, data)
            cif_dict.update(loop_dict)

        ind += jnd

    return cif_dict


def _cif_parse_loop(keys, data_block):
    """A hacky way to parse CIF data loops that can be split by quotes
    or spaces. There must be a better way...

    Parameters:
        keys (list of str): list of keys for the loop.
        data_block (str): raw string of the entire data block.

    Returns:
        Dict[str, str]: a dictionary with keys from ``keys``, containing the
            data split by quotes and spaces. All data is left as
            strings for further processing.

    """

    from collections import deque, defaultdict

    dq = deque(data_block)
    data_list = []
    entry = None
    in_quotes = False
    while dq:
        char = dq.popleft()
        if not char.strip() and entry is None:
            continue
        elif (
            (not char.strip() or char in [" ", ";"])
            and entry is not None
            and not in_quotes
        ):
            data_list.append(entry.strip())
            entry = None
        elif not char.strip() and entry is not None and in_quotes:
            entry += " "
        elif char == "'" and entry and entry is not None:
            in_quotes = False
            data_list.append(entry.strip())
            entry = None
        elif char == "'" and entry is None:
            entry = ""
            in_quotes = True
        else:
            if entry is None:
                entry = char
            else:
                entry += char

    loop_dict = defaultdict(list)
    for ind, entry in enumerate(data_list):
        ind = ind % len(keys)
        loop_dict[keys[ind]].append(entry)

    return loop_dict


def _cif_set_unreduced_sites(doc):
    """Expands sites by symmetry operations found under the key
    `symemtry_equiv_pos_as_xyz` in the cif_dict.

    Parameters:
        doc (dict): matador document to modify. Must contain symops
            under doc['_cif']['_symmetry_equiv_pos_as_xyz']. This doc
            is updated with new `positions_frac`, `num_atoms`, `atom_types`
            and `site_occupancy`.

    """
    from matador.utils.cell_utils import wrap_frac_coords
    from matador.utils.cell_utils import calc_pairwise_distances_pbc
    from matador.fingerprints.pdf import PDF

    species_sites = dict()
    species_occ = dict()

    symmetry_ops = []
    symmetry_functions = []

    def _apply_sym_op(x=None, y=None, z=None, symmetry=None):
        """Returns the site after the applied symmetry operation, in string representation."""
        # cannot use a listcomp here due to interplay with functools
        return [eval(symmetry[0]), eval(symmetry[1]), eval(symmetry[2])]

    for symmetry in doc["_cif"]["_symmetry_equiv_pos_as_xyz"]:
        symmetry = tuple(elem.strip() for elem in symmetry.strip("'").split(","))
        # check the element before doing an eval, as it is so unsafe
        allowed_chars = [
            "x",
            "y",
            "z",
            ".",
            "/",
            "+",
            "-",
            "0",
            "1",
            "2",
            "3",
            "4",
            "5",
            "6",
            "7",
            "8",
            "9",
        ]
        for element in symmetry:
            for character in element:
                if character not in allowed_chars:
                    raise RuntimeError(
                        "You are trying to do something naughty with the symmetry element {}".format(
                            element
                        )
                    )

        symmetry_ops.append(symmetry)
        symmetry_functions.append(functools.partial(_apply_sym_op, symmetry=symmetry))

    for ind, site in enumerate(doc["positions_frac"]):
        species = doc["atom_types"][ind]
        occupancy = doc["site_occupancy"][ind]
        if doc["atom_types"][ind] not in species_sites:
            species_sites[species] = []
            species_occ[species] = []
        for symmetry in symmetry_functions:
            x, y, z = site
            new_site = symmetry(x=x, y=y, z=z)
            new_site = wrap_frac_coords([new_site])[0]
            species_sites[species].append(new_site)
            species_occ[species].append(occupancy)

    unreduced_sites = []
    unreduced_occupancies = []
    unreduced_species = []

    # this loop assumes that no symmetry operation can map 2 unlike sites upon one another
    for species in species_sites:
        unreduced_sites.extend(species_sites[species])
        unreduced_occupancies.extend(species_occ[species])
        unreduced_species.extend(len(species_sites[species]) * [species])

    # check that the symmetry procedure has not generated overlapping atoms
    # this can happen for certain symmetries/cells if positions are not
    # reported to sufficient precision
    images = PDF._get_image_trans_vectors_auto(
        doc["lattice_cart"],
        0.1,
        0.01,
        max_num_images=1,
    )

    poscarts = frac2cart(doc["lattice_cart"], unreduced_sites)
    distances = calc_pairwise_distances_pbc(
        poscarts,
        images,
        doc["lattice_cart"],
        0.01,
        compress=False,
        filter_zero=False,
        per_image=True,
    )

    dupe_set = set()
    for img in distances:
        try:
            i_s, j_s = np.where(~img.mask)
        except ValueError:
            # ValueError will be raised if there is only one atom as i_s, j_s cannot be unpacked
            continue
        for i, j in zip(i_s, j_s):
            if i == j:
                continue
            else:
                # sites can overlap if they have partial occupancy
                if i not in dupe_set and unreduced_species[i] == unreduced_species[j]:
                    dupe_set.add(j)

    doc["positions_frac"] = unreduced_sites
    doc["site_occupancy"] = unreduced_occupancies
    doc["atom_types"] = unreduced_species

    doc["site_occupancy"] = [
        atom for ind, atom in enumerate(unreduced_occupancies) if ind not in dupe_set
    ]
    doc["atom_types"] = [
        atom for ind, atom in enumerate(unreduced_species) if ind not in dupe_set
    ]
    doc["positions_frac"] = [
        atom for ind, atom in enumerate(unreduced_sites) if ind not in dupe_set
    ]

    _num_atoms = np.sum(doc["site_occupancy"])
    if abs(_num_atoms - round(_num_atoms, 0)) < EPS:
        _num_atoms = int(round(_num_atoms, 0))
    doc["num_atoms"] = _num_atoms

    if len(doc["site_occupancy"]) != len(doc["positions_frac"]):
        raise RuntimeError(
            "Size mismatch between positions and occs, {} vs {}".format(
                len(doc["site_occupancy"]), len(doc["positions_frac"])
            )
        )
    if len(doc["positions_frac"]) != len(doc["atom_types"]):
        raise RuntimeError("Size mismatch between positions and types")


def _cif_line_contains_data(line):
    """Check if string contains cif-style data."""
    return not any(
        [line.startswith("_"), line.startswith("#"), line.startswith("loop_")]
    )


@scraper_function
def _ase_cif2dict(fname):
    """Read cif file into ASE object,
    then convert ASE Atoms into matador document.

    Parameters:
        fname (str): cif filename

    Returns:
        (dict, bool): simple matador document with error status.

    """
    import ase.io
    from matador.utils.ase_utils import ase2dict

    fname = fname.replace(".cif", "")
    atoms = ase.io.read(fname + ".cif")
    doc = ase2dict(atoms)

    return doc, True