Source code for matador.scrapers.castep_scrapers

# coding: utf-8
# Distributed under the terms of the MIT License.

""" This file implements the scraper functions for CASTEP-related
inputs and outputs.

"""


from time import strptime
from collections import defaultdict
import os
import glob
import warnings
from pathlib import Path

import numpy as np
from matador.utils.cell_utils import (
    abc2cart,
    calc_mp_spacing,
    cart2volume,
    wrap_frac_coords,
    cart2abc,
    cart2frac,
)
from matador.utils.chem_utils import get_stoich, INVERSE_CM_TO_EV
from matador.scrapers.utils import (
    DFTError,
    ComputationError,
    scraper_function,
    f90_float_parse,
    get_flines_extension_agnostic,
)


[docs]@scraper_function
def res2dict(fname, db=True, **kwargs):
    """Extract available information from .res file; preferably
    used in conjunction with cell or param file.

    Parameters:
        fname (str or list): filename or list of filenames of res file(s)
            (with or without extension).

    Keyword arguments:
        db (bool): whether to fail if unable to scrape energies.

    Returns:
        (tuple): containing either dict/str containing data or error, and a bool stating
            if the scrape was successful.

    """

    flines, fname = get_flines_extension_agnostic(fname, "res")
    res = dict()

    # add .res to source
    res["source"] = [fname]
    # grab file owner username
    try:
        import pwd

        res["user"] = pwd.getpwuid(os.stat(fname).st_uid).pw_name
    except Exception:
        res["user"] = "xxx"

    try:
        get_seed_metadata(res, fname)
    except Exception as exc:
        warnings.warn(
            f"There was an error scraping provenance from filename {fname}: {exc}"
        )

    # alias special lines in res file
    titl = None
    cell = None
    remark = None
    for line in flines:
        if line.startswith("TITL") and db:
            # if not db, then don't read title
            titl = line.split()
            if len(titl) != 12:
                raise RuntimeError(f"Missing some TITL info in {fname}")
        elif line.startswith("CELL"):
            cell = line.split()
        elif line.startswith("REM"):
            remark = line.split()
        elif line.startswith("SFAC"):
            break

    if not cell:
        raise RuntimeError(f"Missing CELL line from {fname}")
    if not titl and db:
        raise RuntimeError(f"Missing TITL line in {fname}")
    if db:
        res["pressure"] = f90_float_parse(titl[2])
        res["cell_volume"] = f90_float_parse(titl[3])
        res["enthalpy"] = f90_float_parse(titl[4])
        res["num_atoms"] = int(titl[7])
        res["space_group"] = titl[8].strip("()")
        res["enthalpy_per_atom"] = res["enthalpy"] / res["num_atoms"]

    res["lattice_abc"] = [
        list(map(f90_float_parse, cell[2:5])),
        list(map(f90_float_parse, cell[5:8])),
    ]
    # calculate lattice_cart from abc
    res["lattice_cart"] = abc2cart(res["lattice_abc"])
    if "cell_volume" not in res:
        res["cell_volume"] = cart2volume(res["lattice_cart"])
    res["atom_types"] = []
    res["positions_frac"] = []
    res["site_occupancy"] = []
    for line_no, line in enumerate(flines):
        if "SFAC" in line:
            i = 1
            while "END" not in flines[line_no + i] and line_no + i < len(flines):
                # check if we don't have some other SHELX keyword in the way, e.g. "UNIT"
                if len(flines[line_no + i]) >= 4 and all(
                    [char.isupper() for char in flines[line_no + i][0:4]]
                ):
                    i += 1
                    continue
                cursor = flines[line_no + i].split()
                res["atom_types"].append(cursor[0])
                res["positions_frac"].append(list(map(f90_float_parse, cursor[2:5])))
                try:
                    res["site_occupancy"].append(f90_float_parse(cursor[5]))
                except IndexError:
                    res["site_occupancy"].append(1.0)
                i += 1

            break

    res["positions_frac"] = wrap_frac_coords(res["positions_frac"])
    res["num_atoms"] = len(res["atom_types"])
    # Parse any remark regarding implicit nanotube encapsulation
    if remark:
        if "NTPROPS" in remark:
            res["cnt_chiral"] = [0, 0]
            res["encapsulated"] = True
            for ind, entry in enumerate(remark):
                if "chiralN" in entry:
                    res["cnt_chiral"][0] = int(remark[ind + 1].replace(",", ""))
                if "chiralM" in entry:
                    res["cnt_chiral"][1] = int(remark[ind + 1].replace(",", ""))
                if entry == "'r':":
                    res["cnt_radius"] = f90_float_parse(
                        remark[ind + 1].replace(",", "")
                    )
                if entry == "'z':":
                    temp_length = remark[ind + 1].replace(",", "")
                    temp_length = temp_length.replace("\n", "")
                    temp_length = temp_length.replace("}", "")
                    res["cnt_length"] = f90_float_parse(temp_length)

    res["stoichiometry"] = get_stoich(res["atom_types"])
    res["num_fu"] = len(res["atom_types"]) / sum(
        [elem[1] for elem in res["stoichiometry"]]
    )

    return res, True


[docs]@scraper_function
def cell2dict(fname, db=False, lattice=True, positions=True, **kwargs):
    """Extract available information from .cell file; probably
    to be merged with another dict from a .param or .res file.

    Parameters:
        fname (str/list): filename or list of filenames of cell file(s)
            to scrape, with or without extension.

    Keyword arguments:
        db (bool): scrape database quality file
        lattice (bool): scrape lattice vectors
        positions (bool): scrape positions

    Returns:
        (tuple): containing either dict/str containing data or error, and a bool stating
            if the scrape was successful.

    """
    cell = dict()
    flines, fname = get_flines_extension_agnostic(fname, "cell")

    # add cell file to source
    cell["source"] = [fname]

    for line_no, line in enumerate(flines):
        if line.startswith(("#", "!")):
            continue
        if "#" or "!" in line:
            line = line.split("#")[0].split("!")[0]
        if "%block lattice_cart" in line.lower() and lattice:
            cell["lattice_cart"] = []
            i = 1
            while "endblock" not in flines[line_no + i].lower():
                if not flines[line_no + i].strip()[0].isalpha():
                    cell["lattice_cart"].append(
                        list(map(f90_float_parse, flines[line_no + i].split()))
                    )
                    if not len(cell["lattice_cart"][-1]) == 3:
                        raise RuntimeError(
                            "Lattice vector does not have enough elements!"
                        )
                i += 1
            if not len(cell["lattice_cart"]) == 3:
                raise RuntimeError("Wrong number of lattice vectors!")
        elif "%block lattice_abc" in line.lower() and lattice:
            cell["lattice_abc"] = []
            i = 1
            while "endblock" not in flines[line_no + i].lower():
                if not flines[line_no + i].strip()[0].isalpha():
                    cell["lattice_abc"].append(
                        list(map(f90_float_parse, flines[line_no + i].split()))
                    )
                    if not len(cell["lattice_abc"][-1]) == 3:
                        raise RuntimeError(
                            "Lattice vector does not have enough elements!"
                        )
                i += 1
            if not len(cell["lattice_abc"]) == 2:
                raise RuntimeError("Wrong specification of lattice_abc")

        elif "%block species_pot" in line.lower():
            cell["species_pot"] = dict()
            i = 1
            while "endblock" not in flines[line_no + i].lower():
                # handle blank lines in species pot
                split_line = flines[line_no + i].split()
                if not split_line:
                    i += 1
                    continue
                if len(split_line) == 2:
                    species = split_line[0]
                    pspot_string = split_line[1].split("/")[-1]
                    cell["species_pot"][species] = pspot_string.replace(
                        "()", ""
                    ).replace("[]", "")
                elif db:
                    raise RuntimeError(
                        f"Cannot parse `species_pot` block line {flines[line_no+i]} with `db=True`, "
                        "expected a (species, pspot) pair. "
                        "Try using `db=False` if specifying a pspot library for all species."
                    )
                elif len(split_line) == 1:
                    cell["species_pot"]["library"] = split_line[0].upper()

                i += 1
            if not cell["species_pot"]:
                cell.pop("species_pot")
        elif "%block cell_constraints" in line.lower():
            cell["cell_constraints"] = []
            for j in range(2):
                cell["cell_constraints"].append(
                    list(map(int, flines[line_no + j + 1].split()))
                )
            if (
                any(len(cell["cell_constraints"][i]) != 3 for i in range(2))
                or len(cell["cell_constraints"]) != 2
            ):
                raise RuntimeError("Invalid cell constraints block.")
        elif "%block hubbard_u" in line.lower():
            cell["hubbard_u"] = defaultdict(list)
            i = 0
            while "endblock" not in flines[line_no + i].lower():
                line = flines[line_no + i]
                if line == "eV" or len(line.split()) < 3:
                    i += 1
                    continue
                else:
                    atom = line.split()[0]
                    orbital = line.split()[1].replace(":", "")
                    shift = f90_float_parse(line.split()[-1])
                    atom = line.split()[0]
                    cell["hubbard_u"][atom] = dict()
                    cell["hubbard_u"][atom][orbital] = shift
                    i += 1
        elif "%block external_pressure" in line.lower():
            cell["external_pressure"] = np.zeros((3, 3))
            i = 1
            j = 0
            while "endblock" not in flines[line_no + i].lower():
                if not flines[line_no + i].strip()[0].isalpha():
                    flines[line_no + i] = flines[line_no + i].replace(",", "")
                    vals = list(map(f90_float_parse, flines[line_no + i].split()))
                    if len(vals) != (3 - j):
                        raise RuntimeError(
                            "External pressure should be specified as upper triangular matrix."
                        )
                    cell["external_pressure"][j] = np.asarray(j * [0.0] + vals).reshape(
                        3
                    )
                    j += 1
                i += 1
            cell["external_pressure"] = cell["external_pressure"].tolist()

        elif "%block external_efield" in line.lower():
            cell["external_efield"] = [
                f90_float_parse(e) for e in flines[line_no + 1].split()
            ]
            if len(cell["external_efield"]) != 3:
                raise RuntimeError(
                    f"EXTERNAL_EFIELD block has wrong shape, should be 3-D not: {cell['external_efield']}"
                )

        elif "%block ionic_constraints" in line.lower():
            cell["ionic_constraints"] = []
            i = 1
            while "endblock" not in flines[line_no + i].lower():
                cell["ionic_constraints"].append(flines[line_no + i].strip())
                i += 1
        # parse kpoints
        elif (
            "kpoints_mp_spacing" in line.lower() or "kpoint_mp_spacing" in line.lower()
        ):
            if (
                "spectral_kpoints_mp_spacing" in line.lower()
                or "spectral_kpoint_mp_spacing" in line.lower()
            ):
                cell["spectral_kpoints_mp_spacing"] = f90_float_parse(line.split()[-1])
            elif (
                "bs_kpoints_mp_spacing" in line.lower()
                or "bs_kpoint_mp_spacing" in line.lower()
            ):
                cell["spectral_kpoints_mp_spacing"] = f90_float_parse(line.split()[-1])
            elif (
                "supercell_kpoints_mp_spacing" in line.lower()
                or "supercell_kpoint_mp_spacing" in line.lower()
            ):
                cell["supercell_kpoints_mp_spacing"] = f90_float_parse(line.split()[-1])
            elif (
                "phonon_kpoints_mp_spacing" in line.lower()
                or "phonon_kpoint_mp_spacing" in line.lower()
            ):
                cell["phonon_kpoint_mp_spacing"] = f90_float_parse(line.split()[-1])
            elif (
                "phonon_fine_kpoints_mp_spacing" in line.lower()
                or "phonon_fine_kpoint_mp_spacing" in line.lower()
            ):
                cell["phonon_fine_kpoint_mp_spacing"] = f90_float_parse(
                    line.split()[-1]
                )
            else:
                cell["kpoints_mp_spacing"] = f90_float_parse(line.split()[-1])
        elif "kpoints_mp_grid" in line.lower() or "kpoint_mp_grid" in line.lower():
            if (
                "spectral_kpoints_mp_grid" in line.lower()
                or "spectral_kpoint_mp_grid" in line.lower()
            ):
                cell["spectral_kpoints_mp_grid"] = list(map(int, line.split()[-3:]))
            if (
                "bs_kpoints_mp_grid" in line.lower()
                or "bs_kpoint_mp_grid" in line.lower()
            ):
                cell["spectral_kpoints_mp_grid"] = list(map(int, line.split()[-3:]))
            # these two keywords do not have a corresponding "kpoints" alias (instead of kpoint)
            # so must remain unpluralised (see below for other phonon keyword exceptions)
            elif (
                "phonon_kpoints_mp_grid" in line.lower()
                or "phonon_kpoint_mp_grid" in line.lower()
            ):
                cell["phonon_kpoint_mp_grid"] = list(map(int, line.split()[-3:]))
            elif (
                "phonon_fine_kpoints_mp_grid" in line.lower()
                or "phonon_fine_kpoint_mp_grid" in line.lower()
            ):
                cell["phonon_fine_kpoint_mp_grid"] = list(map(int, line.split()[-3:]))
            else:
                cell["kpoints_mp_grid"] = list(map(int, line.split()[-3:]))
        elif "kpoints_mp_offset" in line.lower() or "kpoint_mp_offset" in line.lower():
            if (
                "spectral_kpoints_mp_offset" in line.lower()
                or "spectral_kpoint_mp_offset" in line.lower()
            ):
                cell["spectral_kpoints_mp_offset"] = list(
                    map(f90_float_parse, line.split()[-3:])
                )
            if (
                "bs_kpoints_mp_offset" in line.lower()
                or "bs_kpoint_mp_offset" in line.lower()
            ):
                cell["spectral_kpoints_mp_offset"] = list(
                    map(f90_float_parse, line.split()[-3:])
                )
            elif (
                "phonon_kpoints_mp_offset" in line.lower()
                or "phonon_kpoint_mp_offset" in line.lower()
            ):
                # this is a special case where phonon_kpointS_mp_offset doesn't exist
                cell["phonon_kpoint_mp_offset"] = list(
                    map(f90_float_parse, line.split()[-3:])
                )
            elif (
                "phonon_fine_kpoints_mp_offset" in line.lower()
                or "phonon_fine_kpoint_mp_offset" in line.lower()
            ):
                cell["phonon_fine_kpoint_mp_offset"] = list(
                    map(f90_float_parse, line.split()[-3:])
                )
            else:
                cell["kpoints_mp_offset"] = list(
                    map(f90_float_parse, line.split()[-3:])
                )
        elif "%block" in line.lower() and (
            "_kpoints_path" in line.lower() or "_kpoint_path" in line.lower()
        ):
            if "spectral_" in line.lower() or "bs_" in line.lower():
                key = "spectral_kpoints_path"
            elif "phonon_fine_" in line.lower():
                key = "phonon_fine_kpoint_path"
            else:
                raise RuntimeError(f"Found unknown kpoint path key in line: {line}.")
            labels = []
            found_labels = False
            comment_delims = ["!", "%", "#"]
            cell[key] = []
            for line in flines[line_no + 1 :]:
                if "%endblock" in line.lower():
                    break

                # ignore CASTEP BREAK keyword and let dispersion script figure out discontinuities
                if "BREAK" in line.strip().upper():
                    continue

                cell[key].append(list(map(f90_float_parse, line.split()[:3])))

                for delim in comment_delims:
                    if delim in line:
                        labels.append(line.split(delim)[-1].strip())
                        found_labels = True
                        break
                else:
                    labels.append(None)

            else:
                raise RuntimeError(
                    f"Unable to find closing of block {key}, missing %endblock."
                )

            if found_labels:
                cell[key + "_labels"] = labels

        elif any(
            block in line.lower()
            for block in [
                "%block spectral_kpoints_list",
                "%block spectral_kpoint_list",
                "%block bs_kpoint_list",
                "%block bs_kpoints_list",
            ]
        ):
            i = 1
            cell["spectral_kpoints_list"] = []
            while "%endblock" not in flines[line_no + i].lower():
                cell["spectral_kpoints_list"].append(
                    list(map(f90_float_parse, flines[line_no + i].split()[:4]))
                )
                i += 1
        elif (
            "%block phonon_fine_kpoints_list" in line.lower()
            or "%block phonon_fine_kpoint_list" in line.lower()
        ):
            i = 1
            # this is a special case where phonon_fine_kpointS_list doesn't exist
            cell["phonon_fine_kpoint_list"] = []
            while "%endblock" not in flines[line_no + i].lower():
                cell["phonon_fine_kpoint_list"].append(
                    list(map(f90_float_parse, flines[line_no + i].split()[:4]))
                )
                i += 1
        elif "%block phonon_supercell_matrix" in line.lower():
            cell["phonon_supercell_matrix"] = []
            i = 1
            while "endblock" not in flines[line_no + i].lower():
                cell["phonon_supercell_matrix"].append(
                    list(map(int, flines[line_no + i].split()))
                )
                if not len(cell["phonon_supercell_matrix"][-1]) == 3:
                    raise RuntimeError(
                        "Supercell matrix row does not have enough elements!"
                    )
                i += 1
            if not len(cell["phonon_supercell_matrix"]) == 3:
                raise RuntimeError("Wrong supercell matrix shape!")

        elif not db:
            if "%block positions_frac" in line.lower():
                atomic_init_spins = []
                i = 1
                if positions:
                    cell["atom_types"] = []
                    cell["positions_frac"] = []
                while "%endblock positions_frac" not in flines[line_no + i].lower():
                    line = flines[line_no + i].split()
                    if positions:
                        cell["atom_types"].append(line[0])
                        cell["positions_frac"].append(
                            list(map(f90_float_parse, line[1:4]))
                        )
                    if "spin=" in flines[line_no + i].lower():
                        split_line = flines[line_no + i].split()
                        atomic_init_spins.append(
                            float(split_line[-1].lower().replace("spin=", ""))
                        )
                    else:
                        atomic_init_spins.append(None)
                    i += 1
                if any(atomic_init_spins):
                    cell["atomic_init_spins"] = atomic_init_spins
                    if len(cell["atomic_init_spins"]) != len(cell["positions_frac"]):
                        raise RuntimeError("Atomic init spins do not match positions")
                if positions:
                    cell["positions_frac"] = wrap_frac_coords(cell["positions_frac"])
                    cell["num_atoms"] = len(cell["atom_types"])
            elif "%block positions_abs" in line.lower():
                atomic_init_spins = []
                i = 1
                # avoid units
                if len(flines[line_no + i].split()) < 3:
                    i += 1
                if positions:
                    cell["atom_types"] = []
                    cell["positions_abs"] = []
                while "%endblock positions_abs" not in flines[line_no + i].lower():
                    line = flines[line_no + i].split()
                    if positions:
                        cell["atom_types"].append(line[0])
                        cell["positions_abs"].append(
                            list(map(f90_float_parse, line[1:4]))
                        )
                    if "spin=" in flines[line_no + i].lower():
                        split_line = flines[line_no + i].split()
                        atomic_init_spins.append(
                            float(split_line[-1].lower().replace("spin=", ""))
                        )
                    else:
                        atomic_init_spins.append(None)
                    i += 1
                if any(atomic_init_spins):
                    cell["atomic_init_spins"] = atomic_init_spins
                    if len(cell["atomic_init_spins"]) != len(cell["positions_frac"]):
                        raise RuntimeError("Atomic init spins do not match positions")

            elif "fix_com" in line.lower():
                cell["fix_com"] = bool(line.split()[-1])
            elif "fix_all_ions" in line.lower():
                cell["fix_all_ions"] = bool(line.split()[-1])
            elif "fix_all_cell" in line.lower():
                cell["fix_all_cell"] = bool(line.split()[-1])
            elif "fix_vol" in line.lower():
                cell["fix_vol"] = bool(line.split()[-1])
            elif "symmetry_generate" in line.lower():
                cell["symmetry_generate"] = True
            elif "symmetry_tol" in line.lower():
                cell["symmetry_tol"] = f90_float_parse(line.split()[-1])
            elif "snap_to_symmetry" in line.lower():
                cell["snap_to_symmetry"] = True
            elif "quantisation_axis" in line.lower():
                cell["quantisation_axis"] = list(map(int, line.split()[1:]))
            elif "positions_noise" in line.lower():
                cell["positions_noise"] = f90_float_parse(line.split()[-1])
            elif "cell_noise" in line.lower():
                cell["cell_noise"] = f90_float_parse(line.split()[-1])
            elif (
                "kpoints_path" in line.lower()
                or "kpoint_path" in line.lower()
                and "%block" not in line.lower()
            ):
                if (
                    "spectral_kpoints_path_spacing" in line.lower()
                    or "spectral_kpoint_path_spacing" in line.lower()
                ):
                    cell["spectral_kpoints_path_spacing"] = f90_float_parse(
                        line.split()[-1]
                    )
                if (
                    "bs_kpoints_path_spacing" in line.lower()
                    or "bs_kpoint_path_spacing" in line.lower()
                ):
                    cell["spectral_kpoints_path_spacing"] = f90_float_parse(
                        line.split()[-1]
                    )
                elif (
                    "phonon_fine_kpoints_path_spacing" in line.lower()
                    or "phonon_fine_kpoint_path_spacing" in line.lower()
                ):
                    cell["phonon_fine_kpoint_path_spacing"] = f90_float_parse(
                        line.split()[-1]
                    )
                elif (
                    "kpoints_path_spacing" in line.lower()
                    or "kpoint_path_spacing" in line.lower()
                ):
                    cell["kpoints_path_spacing"] = f90_float_parse(line.split()[-1])

    if "external_pressure" not in cell or not cell["external_pressure"]:
        cell["external_pressure"] = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]

    if lattice:
        if "lattice_cart" not in cell and "lattice_abc" in cell:
            cell["lattice_cart"] = abc2cart(cell["lattice_abc"])
        elif "lattice_cart" in cell and "lattice_abc" not in cell:
            cell["lattice_abc"] = cart2abc(cell["lattice_cart"])
        if "lattice_cart" in cell:
            cell["cell_volume"] = cart2volume(cell["lattice_cart"])

    if positions:
        if "positions_frac" not in cell and "positions_abs" in cell:
            cell["positions_frac"] = cart2frac(
                cell["lattice_cart"], cell["positions_abs"]
            )
            cell["positions_frac"] = wrap_frac_coords(cell["positions_frac"])

    if db:
        for species in cell["species_pot"]:
            if "OTF" in cell["species_pot"][species].upper():
                pspot_seed = (
                    "/".join(fname.split("/")[:-1]) + "/" + cell["species_pot"][species]
                )
                if os.path.isfile(pspot_seed):
                    cell["species_pot"].update(usp2dict(pspot_seed))

    return cell, True


[docs]@scraper_function
def param2dict(fname, db=True, **kwargs):
    """Extract available information from .param file; probably
    to be merged with other dicts from other files.

    Parameters:
        fname (str/list): param filename or list of filenames with or
            without file extension

    Keyword arguments:
        db (bool): if True, only scrape relevant info, otherwise scrape all

    Returns:
        (tuple): containing either dict/str containing data or error, and a bool stating
            if the scrape was successful.

    """
    from matador.utils.castep_params import CASTEP_PARAMS

    flines, fname = get_flines_extension_agnostic(fname, "param")
    param = {}
    param["source"] = [fname]
    # exclude some useless info if importing to db
    scrub_list = [
        "checkpoint",
        "write_bib",
        "mix_history_length",
        "fix_occupancy",
        "page_wvfns",
        "num_dump_cycles",
        "backup_interval",
        "geom_max_iter",
        "fixed_npw",
        "write_cell_structure",
        "bs_write_eigenvalues",
        "calculate_stress",
        "opt_strategy",
        "max_scf_cycles",
    ]
    splitters = [":", "=", "\t", " "]
    unrecognised = []
    devel_lines = []
    for line_no, line in enumerate(flines):
        if "#" or "!" in line:
            line = line.split("#")[0].split("!")[0]
        line = line.lower()
        # skip blank lines and comments
        if line.startswith(("#", "!")) or not line.strip() or line_no in devel_lines:
            continue
        else:
            # if scraping to db, ignore "rubbish"
            if db:
                if [rubbish for rubbish in scrub_list if rubbish in line]:
                    continue
            # read all other parameters in
            for splitter in splitters:
                if splitter in line:
                    if not line.startswith("%"):
                        keyword = line.split(splitter)[0].strip()
                        value = line.split(splitter)[-1].strip()
                        if keyword.lower() not in CASTEP_PARAMS:
                            unrecognised.append(keyword.lower())
                        param[keyword] = value
                    # deal with edge cases
                    if "%block devel_code" in line:
                        i = 1
                        while "%endblock devel_code" not in flines[line_no + i].lower():
                            if i + line_no >= len(flines):
                                raise RuntimeError("Found unclosed %block devel_code.")
                            if "magres" in flines[line_no + i].lower():
                                line = flines[line_no + i].upper()
                            else:
                                line = flines[line_no + i].lower()
                            devel_lines.append(line_no + i)
                            if "devel_code" not in param:
                                param["devel_code"] = ""
                            param["devel_code"] += line
                            i += 1
                        break

                    if "true" in value.lower():
                        param[keyword] = True
                    elif "false" in value.lower():
                        param[keyword] = False

                    if "spin_polarised" in line:
                        param["spin_polarized"] = param["spin_polarised"]
                        if "spin_polarised" in param:
                            del param["spin_polarised"]

                    if keyword == "geom_max_iter":
                        param["geom_max_iter"] = int(param["geom_max_iter"])

                    if "cut_off_energy" in line and "mix_cut_off_energy" not in line:
                        temp_cut_off = param["cut_off_energy"].split()
                        if len(temp_cut_off) > 1:
                            if temp_cut_off[1] == "ev":
                                param["cut_off_energy"] = f90_float_parse(
                                    temp_cut_off[0]
                                )
                            elif db:
                                raise RuntimeError(
                                    "cut_off_energy units must be eV or blank in db mode, not {}".format(
                                        temp_cut_off[1]
                                    )
                                )
                            else:
                                param["cut_off_energy"] = "{} {}".format(
                                    temp_cut_off[0], temp_cut_off[1]
                                )
                        else:
                            param["cut_off_energy"] = f90_float_parse(temp_cut_off[0])
                    # standardize tasks that have alternative spellings
                    elif (
                        "task" in line
                        and "spectral_task" not in line
                        and "magres_task" not in line
                    ):
                        if "singlepoint" in param["task"]:
                            param["task"] = "singlepoint"
                        elif "geometry" in param["task"]:
                            param["task"] = "geometryoptimization"
                    elif "xc_functional" in line:
                        param["xc_functional"] = param["xc_functional"].upper()
                    elif "spin" in line and "polar" not in line and "spin" in param:
                        param["spin"] = int(param["spin"])
                    elif "perc_extra_bands" in line:
                        param["perc_extra_bands"] = f90_float_parse(
                            param["perc_extra_bands"]
                        )
                    elif "geom_force_tol" in line:
                        param["geom_force_tol"] = f90_float_parse(
                            param["geom_force_tol"]
                        )
                    elif "elec_energy_tol" in line:
                        temp = param["elec_energy_tol"].lower().replace("ev", "")
                        temp = temp.strip()
                        param["elec_energy_tol"] = f90_float_parse(temp)
                    break

    if unrecognised:
        raise RuntimeError(
            "Found several unrecognised parameters: {}".format(unrecognised)
        )

    return param, True


[docs]@scraper_function
def castep2dict(fname, db=True, intermediates=False, timings=False, **kwargs):
    """From seed filename, create dict of the most relevant
    information about a calculation.

    Parameters:
        fname (str/list): filename or list of filenames of castep file(s)

    Keyword arguments:
        db (bool): whether to error on missing relaxation info
        intermediates (bool): instead of a single dict containing the relaxed structure
            return a list of snapshots found in .castep file
        timings (bool): Run through the CASTEP file one extra time to calculate total time
            taken.

    Returns:
        (tuple): containing either dict/str containing data or error, and a bool stating
            if the scrape was successful.

    """
    flines, fname = get_flines_extension_agnostic(
        fname, ["castep", "history", "history.gz"]
    )

    castep = dict()
    # set source tag to castep file
    castep["source"] = [fname]

    # grab file owner
    try:
        import pwd

        castep["user"] = pwd.getpwuid(os.stat(fname).st_uid).pw_name
    except Exception:
        castep["user"] = "xxx"

    try:
        get_seed_metadata(castep, fname)
    except Exception as exc:
        warnings.warn(
            f"There was an error scraping provenance from filename {fname}: {exc}"
        )

    # wrangle castep file for parameters in 3 passes:
    # once forwards to get number and types of atoms
    _castep_scrape_atoms(flines, castep)
    # once backwards to get the final parameter set for the calculation
    _castep_scrape_final_parameters(flines, castep)

    # task specific options
    if db and "geometry" not in castep["task"]:
        raise RuntimeError("CASTEP file does not contain GO calculation")

    if not db and "thermo" in castep["task"].lower():
        _castep_scrape_thermo_data(flines, castep)

    if (
        not db
        and "thermo" in castep["task"].lower()
        or "phonon" in castep["task"].lower()
    ):
        _castep_scrape_phonon_frequencies(flines, castep)

    # only scrape snapshots/number of intermediates if requested
    try:
        snapshots, castep["geom_iter"] = _castep_scrape_all_snapshots(
            flines, intermediates=intermediates
        )
        if intermediates:
            castep["intermediates"] = snapshots
    except RuntimeError as exc:
        if intermediates:
            raise RuntimeError("Failed to scrape intermediates: {}".format(exc))

    if timings:
        _castep_scrape_metadata(flines, castep)

    # once more forwards, from the final step, to get the final structure
    _castep_scrape_final_structure(flines, castep, db=db)

    # scrape any BEEF post-processing
    _castep_scrape_beef(flines, castep)

    # scrape any AJM group-specific devel codes
    _castep_scrape_devel_code(flines, castep)

    if "positions_frac" not in castep or not castep["positions_frac"]:
        raise ComputationError("Could not find positions")

    # unfortunately CASTEP does not write forces when there is only one atom
    if (
        "forces" not in castep
        and castep["num_atoms"] == 1
        and "geometry" in castep["task"]
    ):
        castep["forces"] = [[0, 0, 0]]

    # finally check for pseudopotential files if OTF is present in species_pot
    if db:
        for species in castep["species_pot"]:
            if "OTF" in castep["species_pot"][species].upper():
                pspot_seed = str(
                    Path(fname).parent.joinpath(castep["species_pot"][species])
                )
                for globbed in glob.glob(pspot_seed):
                    if os.path.isfile(globbed):
                        castep["species_pot"].update(usp2dict(globbed))

    # check that any optimized results were saved and raise errors if not
    if not castep.get("optimised"):
        castep["optimised"] = False
        if db:
            # if importing to db, skip unconverged structure
            raise DFTError("CASTEP GO failed to converge.")

    return castep, True


[docs]@scraper_function
def bands2dict(fname, **kwargs):
    """Parse a CASTEP bands file into a dictionary, which can be used as input to
    an :obj:`matador.orm.spectral.ElectronicDispersion` object.

    Parameters:
        fname (str/list): filename of list of filenames to be scraped.

    Returns:
        (tuple): containing either dict/str containing data or error, and a bool stating
            if the scrape was successful.

    """
    from matador.utils.chem_utils import HARTREE_TO_EV, BOHR_TO_ANGSTROM

    bs = dict()
    flines, fname = get_flines_extension_agnostic(fname, "bands")

    header = flines[:9]
    data = flines[9:]

    bs["source"] = [fname]

    bs["num_kpoints"] = int(header[0].split()[-1])
    bs["num_spins"] = int(header[1].split()[-1])
    bs["num_electrons"] = f90_float_parse(header[2].split()[-1])
    bs["num_bands"] = int(header[3].split()[-1])

    bs["spin_fermi_energy_Ha"] = [
        f90_float_parse(val) for val in header[4].split()[-bs["num_spins"] :]
    ]
    bs["spin_fermi_energy"] = [
        HARTREE_TO_EV * val for val in bs["spin_fermi_energy_Ha"]
    ]
    bs["fermi_energy"] = np.mean(bs["spin_fermi_energy"])

    bs["lattice_cart"] = []
    for i in range(3):
        bs["lattice_cart"].append(
            [BOHR_TO_ANGSTROM * f90_float_parse(elem) for elem in header[6 + i].split()]
        )
    bs["kpoint_path"] = np.zeros((bs["num_kpoints"], 3))
    bs["kpoint_weights"] = np.zeros((bs["num_kpoints"]))
    bs["eigenvalues_k_s"] = np.empty(
        (bs["num_spins"], bs["num_bands"], bs["num_kpoints"])
    )

    for nk in range(bs["num_kpoints"]):
        kpt_ind = nk * (bs["num_spins"] * bs["num_bands"] + bs["num_spins"] + 1)
        bs["kpoint_path"][int(data[kpt_ind].split()[1]) - 1] = np.asarray(
            [f90_float_parse(elem) for elem in data[kpt_ind].split()[-4:-1]]
        )
        bs["kpoint_weights"][int(data[kpt_ind].split()[1]) - 1] = f90_float_parse(
            data[kpt_ind].split()[-1]
        )
        for nb in range(bs["num_bands"]):
            for ns in range(bs["num_spins"]):
                line_number = kpt_ind + 2 + ns + (ns * bs["num_bands"]) + nb
                bs["eigenvalues_k_s"][ns][nb][
                    int(data[kpt_ind].split()[1]) - 1
                ] = f90_float_parse(data[line_number].strip())

    bs["eigenvalues_k_s"] *= HARTREE_TO_EV
    bs["eigs_s_k"] = bs["eigenvalues_k_s"]

    return bs, True


[docs]@scraper_function
def arbitrary2dict(fname, **kwargs):
    """Read arbitrary CASTEP-style input files into
    a dictionary.

    Parameters:
        fname (str/list): filename or list of filenames.

    Returns:
        (tuple): containing either dict/str containing data or error, and a bool stating
            if the scrape was successful.

    """

    flines, fname = get_flines_extension_agnostic(fname, None)

    splitters = [":", "=", "\t", " "]
    comment_delims = ["!", "%", "#", "/*", "*/"]

    result = {}
    result["source"] = [fname]
    for line in flines:
        comment = False
        for delim in comment_delims:
            if line.strip().startswith(delim):
                comment = True
                break
        if not comment:
            for splitter in splitters:
                if splitter in line:
                    keyword = line.split(splitter)[0].strip()
                    value = line.split(splitter)[-1].strip()
                    result[keyword.lower()] = value
                    break

    return result, True


[docs]@scraper_function
def optados2dict(fname, **kwargs):
    """Scrape optados output file (*.*.dat) or (*.pdos.*.dat)
    for DOS, projectors and projected DOS/dispersion.

    Parameters:
        fname (str/list): optados filename or list of filenames.

    Returns:
        (tuple): containing either dict/str containing data or error, and a bool stating
            if the scrape was successful.

    """
    optados = dict()
    is_pdos = False
    is_pdis = False
    is_spin_dos = False
    dos_unit_label = None

    flines, fname = get_flines_extension_agnostic(fname, None)

    header = []
    header_ind = 0
    for ind, line in enumerate(flines):
        header_ind = ind
        if not line.strip().startswith("#") or "K-point" in line:
            break
        if "Partial" in line:
            is_pdos = True
        elif "spin" in line:
            is_spin_dos = True
        elif "Projected Dispersion" in line:
            is_pdis = True
        elif "DOS (" in line and "Integrated" not in line:
            dos_unit_label = " ".join(line.split()[2:])
        else:
            header.append(line)

    flines = flines[header_ind:]

    if not is_pdis:
        data = np.loadtxt(fname, comments="#")
        optados["energies"] = data[:, 0]

    if is_pdos or is_pdis:
        optados["projectors"] = _optados_get_projector_labels(header)
        optados["num_projectors"] = len(optados["projectors"])

    if dos_unit_label is not None:
        optados["dos_unit_label"] = dos_unit_label

    if is_pdos:
        # get pdos values
        optados["pdos"] = dict()
        optados["sum_pdos"] = np.zeros_like(data[:, 0])
        for i, projector in enumerate(optados["projectors"]):
            # optados spin-down projectors are negative, unfortunately
            if "down" in projector:
                optados["pdos"][projector] = -data[:, i + 1]
            else:
                optados["pdos"][projector] = data[:, i + 1]
            optados["sum_pdos"] += data[:, i + 1]

    elif is_spin_dos:
        optados["spin_dos"] = dict()
        optados["spin_dos"]["up"] = data[:, 1]
        optados["spin_dos"]["down"] = -data[:, 2]
        optados["dos"] = np.abs(optados["spin_dos"]["up"]) + np.abs(
            optados["spin_dos"]["down"]
        )

    elif is_pdis:
        optados["projector_weights"] = []
        optados["kpoints"] = []
        optados["eigenvalues"] = []
        # get kpoints and count number of bands
        kpt_ind = -1
        for i, line in enumerate(flines):
            if "K-point" in line:
                optados["kpoints"].append(
                    [f90_float_parse(val) for val in line.split()[-3:]]
                )
                if kpt_ind == -1:
                    kpt_ind = i
                else:
                    if not optados.get("num_bands"):
                        optados["num_bands"] = i - kpt_ind - 1

        optados["num_kpoints"] = len(optados["kpoints"])

        for nk in range(optados["num_kpoints"]):
            eigs = []
            pdis = []
            for nb in range(0, optados["num_bands"]):
                eigs.append(
                    f90_float_parse(
                        flines[nk * (optados["num_bands"] + 1) + nb + 1].split()[0]
                    )
                )
                pdis.append(
                    [
                        f90_float_parse(val)
                        for val in flines[
                            nk * (optados["num_bands"] + 1) + 1 + nb
                        ].split()[1:]
                    ]
                )
            optados["eigenvalues"].append(eigs)
            optados["projector_weights"].append(pdis)

        optados["projector_weights"] = np.asarray(optados["projector_weights"])

    else:
        optados["dos"] = data[:, 1]

    return optados, True


[docs]@scraper_function
def phonon2dict(fname, **kwargs):
    """Parse a CASTEP phonon file into a dictionary.

    Parameters:
        fname (str/list): phonon filename or list of filenames.

    Returns:
        (tuple): containing either dict/str containing data or error, and a bool stating
            if the scrape was successful.

    """

    flines, fname = get_flines_extension_agnostic(fname, ["phonon", "phonon_dos"])

    ph = dict()
    ph["source"] = [fname]

    dos_present = False
    data_start = 0

    for line_no, line in enumerate(flines):
        line = line.lower()
        if "number of ions" in line:
            ph["num_atoms"] = int(line.split()[-1])
        elif "number of branches" in line:
            ph["num_modes"] = int(line.split()[-1])
            ph["num_branches"] = ph["num_modes"]
        elif "number of wavevectors" in line:
            ph["num_kpoints"] = int(line.split()[-1])
        elif "frequencies in" in line:
            ph["freq_unit"] = line.split()[-1]
        elif "unit cell vectors" in line:
            ph["lattice_cart"] = []
            for i in range(3):
                ph["lattice_cart"].append(
                    [f90_float_parse(elem) for elem in flines[line_no + i + 1].split()]
                )
            ph["lattice_abc"] = cart2abc(ph["lattice_cart"])
        elif "fractional co-ordinates" in line:
            ph["positions_frac"] = []
            ph["atom_types"] = []
            ph["atom_masses"] = []
            i = 1
            while "END header" not in flines[line_no + i]:
                ph["positions_frac"].append(
                    [f90_float_parse(elem) for elem in flines[line_no + i].split()[1:4]]
                )
                ph["atom_types"].append(flines[line_no + i].split()[-2])
                ph["atom_masses"].append(
                    f90_float_parse(flines[line_no + i].split()[-1])
                )
                i += 1
        elif "end header" in line:
            data_start = line_no + 1
        elif "begin dos" in line:
            dos_present = True
            projector_labels = flines[line_no].split()[5:]
            projector_labels = [(label, None, None) for label in projector_labels]
            begin_dos = line_no + 1

        elif "q-pt" in line:
            last_qpt_ind = int(line.split()[1])

    if dos_present:
        # extra header line with GRADIENTS written when dos is present
        data_start += 1
        # no eigenvectors written when dos is present
        line_offset = ph["num_modes"] + 1
    else:
        line_offset = ph["num_modes"] * (ph["num_atoms"] + 1) + 3

    data = flines[data_start:]

    ph["phonon_kpoint_list"] = []
    if "num_kpoints" not in ph:
        ph["num_kpoints"] = last_qpt_ind
    ph["eigenvalues_q"] = np.zeros((1, ph["num_modes"], ph["num_kpoints"]))
    raman_intensity = np.zeros_like(ph["eigenvalues_q"])
    infrared_intensity = np.zeros_like(ph["eigenvalues_q"])
    raman = False
    ir = False
    for qind in range(ph["num_kpoints"]):
        ph["phonon_kpoint_list"].append(
            [f90_float_parse(elem) for elem in data[qind * line_offset].split()[2:]]
        )
        for i in range(1, ph["num_modes"] + 1):
            line_split = data[qind * line_offset + i].split()
            ph["eigenvalues_q"][0][i - 1][qind] = f90_float_parse(line_split[1])
            if len(line_split) > 2:
                infrared_intensity[0][i - 1][qind] = f90_float_parse(line_split[2])
                ir = True
            if len(line_split) > 3:
                raman_intensity[0][i - 1][qind] = f90_float_parse(line_split[3])
                raman = True

    if ir:
        ph["infrared_intensity"] = infrared_intensity
    if raman:
        ph["raman_intensity"] = raman_intensity

    if dos_present:
        # remove header and "END"
        flines = flines[begin_dos:-1]
        raw_data = np.genfromtxt(flines)
        ph["energies"] = raw_data[:, 0] * INVERSE_CM_TO_EV
        ph["dos"] = raw_data[:, 1]
        ph["pdos"] = dict()
        ph["pdos"]["pdos"] = dict()
        ph["pdos"]["projectors"] = []
        ph["pdos"]["energies"] = ph["energies"]

        for i, label in enumerate(projector_labels):
            ph["pdos"]["pdos"][label] = raw_data[:, i + 2]
            ph["pdos"]["projectors"].append(label)

    ph["kpoint_path"] = np.asarray([qpt[0:3] for qpt in ph["phonon_kpoint_list"]])
    ph["kpoint_weights"] = [qpt[3] for qpt in ph["phonon_kpoint_list"]]
    ph["eigenvalues_q"] *= INVERSE_CM_TO_EV
    ph["softest_mode_freq"] = np.min(ph["eigenvalues_q"])

    if ph["softest_mode_freq"] < -1:
        import warnings

        warnings.warn(
            f"File {fname} has negative eigenvalue {ph['softest_mode_freq']}."
        )
    ph["eigs_q"] = ph["eigenvalues_q"]

    return ph, True


[docs]@scraper_function
def phonon_dos2dict(*args, **kwargs):
    """Wrapper for old phonon DOS scraper, which has since been merged
    with `phonon2dict`. Note that this function still has a different
    effect to `phonon2dict` when `as_model` is used as the results will
    be cast into a :class:`VibrationalDOS` object.

    """
    return phonon2dict(*args, no_wrap=True, **kwargs)


[docs]def usp2dict(fname, **kwargs):
    """Extract pseudopotential string from a CASTEP
    OTF .USP file.

    Parameters:
        fname (str/list): filename of usp file, or list of filenames.

    Returns:
        dict: partial species_pot dict from usp file.

    """
    species_pot = dict()
    with open(fname, "r") as f:
        flines = f.readlines()
        for line_no, line in enumerate(flines):
            if "Pseudopotential Report" in line:
                i = 0
                while i + line_no < len(flines) - 3:
                    if "Pseudopotential Report" in flines[line_no + i]:
                        i += 2
                        elem = flines[line_no + i].split(":")[1].split()[0]
                    elif "core correction" in flines[line_no + i]:
                        i += 2
                        species_pot[elem] = flines[line_no + i].strip().split()[1]
                        # check next line for wrapped definition
                        if flines[line_no + i + 1].strip().startswith("--------"):
                            break
                        else:
                            species_pot[elem] += (
                                flines[line_no + i + 1].strip().split()[1]
                            )
                            break
                    i += 1
    species_pot[elem] = species_pot[elem].replace('"', "")
    species_pot[elem] = species_pot[elem].replace("[]", "")
    return species_pot


def _castep_scrape_thermo_data(flines, castep):
    """Scrape the data from a CASTEP Thermodynamics claculation.

    Note:
        This only scrapes from Thermodynamics section currently,
        NOT Atomic Displacement Parameters

    Parameters:
        flines (list): list of lines contained in file
        castep (dict): dictionary to update with data

    Returns:
        dict: dictionary updated with scraped thermodynamics data
              in format doc['energy_type'] = {300:-0.01,200:-0.05}

    """
    for line_no, line in enumerate(flines):
        if "Number of temperature values" in line:
            castep["thermo_num_temp_vals"] = int(line.split(":")[-1].strip())
        elif "Initial temperature" in line:
            castep["thermo_temp_init"] = f90_float_parse(
                line.split(":")[1].strip().split(" ")[0]
            )
        elif "Final temperature" in line:
            castep["thermo_temp_final"] = f90_float_parse(
                line.split(":")[1].strip().split(" ")[0]
            )
        elif "Spacing between temperature values" in line:
            castep["thermo_temp_spacing"] = f90_float_parse(
                line.split(":")[1].strip().split(" ")[0]
            )
        elif "Zero-point energy" in line:
            castep["thermo_zero_point_energy"] = f90_float_parse(
                line.split("=")[1].strip().split(" ")[0]
            )
        elif "T(K)" and "E(eV)" in line:
            castep["thermo_temps"] = []  # temperatures calculation was done at
            castep["thermo_enthalpy"] = {}  # enthalpy E(eV)
            castep["thermo_free_energy"] = {}  # free energy F(eV)
            castep["thermo_entropy"] = {}  # entropy S(J/mol/K)
            castep["thermo_heat_cap"] = {}  # heat capacity Cv(J/mol/K)
            i = 2
            while True:
                if not flines[line_no + i + 1].strip():
                    break
                else:
                    temp_line = flines[line_no + i].split()
                    castep["thermo_temps"].append(f90_float_parse(temp_line[0]))
                    castep["thermo_enthalpy"][
                        f90_float_parse(temp_line[0])
                    ] = f90_float_parse(temp_line[1])
                    castep["thermo_free_energy"][
                        f90_float_parse(temp_line[0])
                    ] = f90_float_parse(temp_line[2])
                    castep["thermo_entropy"][
                        f90_float_parse(temp_line[0])
                    ] = f90_float_parse(temp_line[3])
                    castep["thermo_heat_cap"][
                        f90_float_parse(temp_line[0])
                    ] = f90_float_parse(temp_line[4])
                i += 1


def _castep_scrape_phonon_frequencies(flines, castep):
    """Iterate through flines to scrape the phonon frequencies
    from this CASTEP calculation. Will only scrape the *final* set
    of frequencies in the CASTEP file, ignoring any others.

    """
    phonons = {}

    phonons["phonon_fine_kpoint_list"] = []
    phonons["phonon_fine_kpoint_weights"] = []
    phonons["eigs_q"] = []

    # first find the last block of frequencies
    for line_no, line in enumerate(flines):
        if "Performing frequency calculation at " in line:
            start_line_no = line_no + 2

    q_pt_ind = 0
    for line_no, line in enumerate(flines[start_line_no:]):
        if "============" in line:
            break
        if "q-pt=" in line:
            q_pt = [
                f90_float_parse(val)
                for val in line.split("(")[-1].split(")")[0].split()
            ]
            phonons["phonon_fine_kpoint_list"].append(q_pt)
            phonons["phonon_fine_kpoint_weights"].append(
                f90_float_parse(line.split()[-2])
            )
            phonons["eigs_q"].append([])

            for _, freq_line in enumerate(flines[start_line_no:][line_no + 6 :]):
                if "........................." in freq_line:
                    break
                phonons["eigs_q"][q_pt_ind].append(
                    f90_float_parse(freq_line.split()[2])
                )
            q_pt_ind += 1

    phonons["num_modes"] = len(phonons["eigs_q"][0])
    phonons["eigs_q"] = np.asarray(phonons["eigs_q"]).T
    phonons["eigs_q"] = INVERSE_CM_TO_EV * phonons["eigs_q"].reshape(
        1, *np.shape(phonons["eigs_q"])
    )
    phonons["kpoint_path"] = phonons["phonon_fine_kpoint_list"]
    phonons["kpoint_weights"] = phonons["phonon_fine_kpoint_weights"]
    phonons["num_kpoints"] = len(phonons["phonon_fine_kpoint_list"])
    phonons["num_qpoints"] = len(phonons["phonon_fine_kpoint_list"])
    phonons["softest_mode_freq"] = np.min(phonons["eigs_q"])
    if phonons["softest_mode_freq"] < -1:
        import warnings

        warnings.warn(
            f"File {castep['source'][0]} has negative eigenvalue {phonons['softest_mode_freq']}."
        )

    for key in phonons:
        castep[key] = phonons[key]


def _castep_scrape_atoms(flines, castep):
    """Iterate forwards through flines to scrape atomic types and
    initial positions, and get a preliminary lattice.

    Parameters:
        flines (list): list of lines in file
        castep (dict): dictionary to update with scraped data

    Returns:
        dict: dictionary updated with scraped data

    """
    for line_no, line in enumerate(flines):
        if "Real Lattice" in line:
            castep["lattice_cart"] = []
            i = 1
            while True:
                if not flines[line_no + i].strip():
                    break
                else:
                    temp_line = flines[line_no + i].split()[0:3]
                    castep["lattice_cart"].append(list(map(f90_float_parse, temp_line)))
                i += 1
        if "Lattice parameters" in line:
            castep["lattice_abc"] = []
            i = 1
            castep["lattice_abc"].append(
                list(
                    map(
                        f90_float_parse,
                        [
                            flines[line_no + i].split("=")[1].strip().split(" ")[0],
                            flines[line_no + i + 1].split("=")[1].strip().split(" ")[0],
                            flines[line_no + i + 2].split("=")[1].strip().split(" ")[0],
                        ],
                    )
                )
            )
            castep["lattice_abc"].append(
                list(
                    map(
                        f90_float_parse,
                        [
                            flines[line_no + i].split("=")[-1].strip(),
                            flines[line_no + i + 1].split("=")[-1].strip(),
                            flines[line_no + i + 2].split("=")[-1].strip(),
                        ],
                    )
                )
            )
        if "Current cell volume" in line:
            castep["cell_volume"] = f90_float_parse(
                line.split("=")[1].split()[0].strip()
            )
        if "atom types" not in castep and "Cell Contents" in line:
            castep["atom_types"] = []
            castep["positions_frac"] = []
            i = 1
            atoms = False
            while True:
                if atoms:
                    if "xxxxxxxxx" in flines[line_no + i]:
                        atoms = False
                        break
                    else:
                        castep["atom_types"].append(flines[line_no + i].split()[1])
                        castep["positions_frac"].append(
                            list(
                                map(f90_float_parse, (flines[line_no + i].split()[3:6]))
                            )
                        )
                if "x------" in flines[line_no + i]:
                    atoms = True
                i += 1
            for ind, pos in enumerate(castep["positions_frac"]):
                for k in range(3):
                    if pos[k] > 1 or pos[k] < 0:
                        castep["positions_frac"][ind][k] %= 1
            castep["num_atoms"] = len(castep["atom_types"])
            castep["stoichiometry"] = get_stoich(castep["atom_types"])
            castep["num_fu"] = castep["num_atoms"] / sum(
                [elem[1] for elem in castep["stoichiometry"]]
            )
            break
    else:
        raise ComputationError("Unable to find atoms in CASTEP file.")


def _castep_scrape_final_parameters(flines, castep):
    """Scrape the DFT parameters and metadata from a CASTEP file,
    using those listed last in the file
    (i.e. those used to make the final structure).

    Parameters:
        flines (list): list of lines in the file
        castep (dict): dictionary in which to put scraped data

    Returns:
        dict: dictionary updated with scraped data

    """

    pspot_report_dict = dict()
    for line_no, line in enumerate(reversed(flines)):
        line_no = len(flines) - 1 - line_no
        if "castep_version" not in castep and "Release CASTEP version" in line:
            # version is first thing printed in file, so break once this is read
            castep["castep_version"] = line.replace("|", "").split()[-1]
            break
        elif "date" not in castep and "Run started:" in line:
            try:
                year = line.split()[5]
                month = str(strptime(line.split()[4], "%b").tm_mon)
                day = line.split()[3]
                castep["date"] = day + "-" + month + "-" + year
            except (IndexError, ValueError):
                castep["date"] = "unknown"
        elif "_compiler_architecture" not in castep and "compiled for" in line.lower():
            try:
                castep["_compiler_architecture"] = line.split()[2]
            except IndexError:
                castep["_compiler_architecture"] = "unknown"
        elif "_castep_commit" not in castep and " from code version" in line.lower():
            try:
                castep["_castep_commit"] = " ".join(
                    flines[line_no : line_no + 2]
                ).split()[3]
            except (IndexError, ValueError):
                castep["_castep_commit"] = "unknown"
        elif "task" not in castep and "type of calculation" in line:
            castep["task"] = line.split(":")[-1].strip().replace(" ", "")
        elif "xc_functional" not in castep and "functional" in line:
            # convert from .castep file xc_functional to param style
            xc_string = line.split(":")[-1].strip()
            if "Local Density Approximation" in xc_string:
                castep["xc_functional"] = "LDA"
            elif "Perdew Burke Ernzerhof" in xc_string:
                castep["xc_functional"] = "PBE"
            elif "PBE for solids" in xc_string:
                castep["xc_functional"] = "PBESol"
            elif "hybrid B3LYP" in xc_string:
                castep["xc_functional"] = "B3LYP"
            elif "hybrid HSE03" in xc_string:
                castep["xc_functional"] = "HSE03"
            elif "hybrid HSE06" in xc_string:
                castep["xc_functional"] = "HSE06"
            elif "RSCAN" in xc_string:
                castep["xc_functional"] = "RSCAN"
            elif "Screened Hartree-Fock" in xc_string:
                castep["xc_functional"] = "SHF-LDA"
            else:
                castep["xc_functional"] = xc_string.split(":")[-1]
                warnings.warn(
                    "Unrecognised functional {xc}: scraping as {xc}."
                    "This may lead to incompatible param files.".format(
                        xc=castep["xc_functional"]
                    )
                )
        elif "cut_off_energy" not in castep and "plane wave basis set" in line:
            castep["cut_off_energy"] = f90_float_parse(line.split(":")[-1].split()[0])
        elif (
            "finite_basis_corr" not in castep
            and "finite basis set correction  " in line
        ):
            castep["finite_basis_corr"] = line.split(":")[-1].strip()
        elif "kpoints_mp_grid" not in castep and "MP grid size for SCF" in line:
            castep["kpoints_mp_grid"] = list(
                map(int, list(line.split("is")[-1].split()))
            )
        elif "num_kpoints" not in castep and "Number of kpoints used" in line:
            castep["num_kpoints"] = int(line.split()[-1])
        elif "geom_force_tol" not in castep and "max ionic |force| tolerance" in line:
            castep["geom_force_tol"] = f90_float_parse(line.split()[-2])
        elif (
            "elec_energy_tol" not in castep
            and "total energy / atom convergence tol" in line
        ):
            castep["elec_energy_tol"] = f90_float_parse(line.split()[-2])
        elif (
            "sedc_apply" not in castep
            and "DFT+D: Semi-empirical dispersion correction    : on" in line
        ):
            castep["sedc_apply"] = True
            castep["sedc_scheme"] = flines[line_no + 1].split(":")[1].split()[0]
        elif "space_group" not in castep and "Space group of crystal" in line:
            space_group = line.split(":")[-1].split(",")[0].strip().replace(" ", "")
            if space_group:
                castep["space_group"] = space_group
        elif "nelectrons" not in castep and "number of  electrons" in line:
            castep["nelectrons"] = f90_float_parse(line.split(":")[-1])
        elif "nelectrons" not in castep and "number of bands" in line:
            castep["nbands"] = int(line.split(":")[-1])
        elif "Cell constraints are" in line and "cell_constraints" not in castep:
            castep["cell_constraints"] = [
                int(val) for val in line.split(":")[-1].split()
            ]
            if castep["cell_constraints"] == [1, 2, 3, 4, 5, 6]:
                del castep["cell_constraints"]
            elif all(val == 0 for val in castep["cell_constraints"]):
                castep["fix_all_cell"] = True
                del castep["cell_constraints"]
            if "cell_constraints" in castep:
                if len(castep["cell_constraints"]) != 6:
                    raise RuntimeError("Unable to read cell constraints block.")
                castep["cell_constraints"] = [
                    castep["cell_constraints"][0:3],
                    castep["cell_constraints"][3:],
                ]
        elif "external_pressure" not in castep and "External pressure/stress" in line:
            try:
                castep["external_pressure"] = []
                for i in range(3):
                    parsed_line = list(
                        map(f90_float_parse, flines[line_no + i + 1].split())
                    )
                    if len(parsed_line) != 3:
                        parsed_line = i * [0.0] + parsed_line
                    castep["external_pressure"].append(parsed_line)
            except ValueError:
                castep["external_pressure"] = [
                    [0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0],
                ]
        elif (
            "spin_polarized" not in castep
            and "treating system as spin-polarized" in line
        ):
            castep["spin_polarized"] = True
        elif "hubbard_u" not in castep and "Hubbard U values are eV" in line:
            castep["hubbard_u"] = defaultdict(list)
            n_lines_header = 5
            i = 0
            while i < castep["num_atoms"]:
                line = flines[line_no + i + n_lines_header].strip()
                atom = line.split()[0].replace("|", "")
                shifts = list(map(f90_float_parse, line.split()[-5:-1]))
                for ind, shift in enumerate(shifts):
                    if shift != 0:
                        if atom not in castep["hubbard_u"]:
                            castep["hubbard_u"][atom] = dict()
                        if ind == 0:
                            orbital = "s"
                        elif ind == 1:
                            orbital = "p"
                        elif ind == 2:
                            orbital = "d"
                        elif ind == 3:
                            orbital = "f"
                        castep["hubbard_u"][atom][orbital] = shift
                i += 1
        elif "Pseudopotential Report" in line:
            if "species_pot" not in castep:
                castep["species_pot"] = dict()
            i = 0
            while i + line_no < len(flines) - 3:
                if "Pseudopotential Report" in flines[line_no + i]:
                    i += 2
                    elem = flines[line_no + i].split(":")[1].split()[0]

                elif "core correction" in flines[line_no + i]:
                    i += 2
                    if not pspot_report_dict.get(elem):
                        castep["species_pot"][elem] = (
                            flines[line_no + i].split('"')[1].replace("[]", "")
                        )
                        pspot_report_dict[elem] = True
                i += 1
        elif "species_pot" not in castep and "Files used for pseudopotentials" in line:
            if "species_pot" not in castep:
                castep["species_pot"] = dict()
            i = 1
            while True:
                if not flines[line_no + i].strip():
                    break
                else:
                    elem = flines[line_no + i].split()[0].strip()
                    if not pspot_report_dict.get(elem):
                        castep["species_pot"][elem] = (
                            flines[line_no + i].split()[1].split("/")[-1]
                        )
                        if castep["species_pot"][elem] == "Pseudopotential":
                            castep["species_pot"][elem] = (
                                flines[line_no + i].split()[0].strip()
                            )
                            castep["species_pot"][elem] += "_OTF.usp"
                        pspot_report_dict[elem] = False
                i += 1
        elif "peak_mem_MB" not in castep and "Peak Memory Use" in line:
            try:
                castep["peak_mem_MB"] = int(f90_float_parse(line.split()[-2]) / 1024)
            except (IndexError, ValueError):
                castep["peak_mem_MB"] = -1
        elif (
            "estimated_mem_per_process_MB" not in castep
            and "total storage required per process" in line
        ):
            try:
                castep["estimated_mem_per_process_MB"] = f90_float_parse(
                    line.split()[-5]
                )
            except (IndexError, ValueError):
                castep["estimated_mem_per_process_MB"] = 0
        elif (
            "num_mpi_processes" not in castep
            and "Calculation parallelised over" in line
        ):
            try:
                castep["num_mpi_processes"] = int(f90_float_parse(line.split()[3]))
            except Exception:
                castep["num_mpi_processes"] = 1
        elif "num_mpi_processes" not in castep and "Calculation not parall" in line:
            castep["num_mpi_processes"] = 1
        elif "_xc_beef" not in castep and "BEEF completed" in line:
            castep["_xc_beef"] = True

    # write zero pressure if not found in file
    if "external_pressure" not in castep:
        castep["external_pressure"] = [
            [0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0],
        ]
    if "spin_polarized" not in castep:
        castep["spin_polarized"] = False
    if "num_mpi_processes" in castep and "estimated_mem_per_process_MB" in castep:
        castep["estimated_mem_MB"] = (
            castep["estimated_mem_per_process_MB"] * castep["num_mpi_processes"]
        )


def _castep_scrape_final_structure(flines, castep, db=True):
    """Scrape final structure from CASTEP file.

    Parameters:
        flines (list): list of lines contained in file
        castep (dict): dictionary to update with data

    Keyword arguments:
        db (bool): whether to enforce database style, e.g. geometry optimisation only

    Returns:
        dict: dictionary updated with scraped data

    """
    if "task" in castep and castep["task"].strip() == "geometryoptimization":
        castep["optimised"] = False
        finish_line, castep["optimised"] = _castep_find_final_structure(flines)
    elif not db:
        finish_line = 0

    final_flines = flines[finish_line + 1 :]
    for line_no, line in enumerate(final_flines):
        if "Real Lattice" in line:
            castep["lattice_cart"] = []
            i = 1
            while i < 4:
                if not final_flines[line_no + i].strip():
                    break
                else:
                    temp_line = final_flines[line_no + i].split()[0:3]
                    castep["lattice_cart"].append(list(map(f90_float_parse, temp_line)))
                i += 1
        if "Lattice parameters" in line:
            castep["lattice_abc"] = []
            i = 1
            castep["lattice_abc"].append(
                list(
                    map(
                        f90_float_parse,
                        [
                            final_flines[line_no + i]
                            .split("=")[1]
                            .strip()
                            .split(" ")[0],
                            final_flines[line_no + i + 1]
                            .split("=")[1]
                            .strip()
                            .split(" ")[0],
                            final_flines[line_no + i + 2]
                            .split("=")[1]
                            .strip()
                            .split(" ")[0],
                        ],
                    )
                )
            )
            castep["lattice_abc"].append(
                list(
                    map(
                        f90_float_parse,
                        [
                            final_flines[line_no + i].split("=")[-1].strip(),
                            final_flines[line_no + i + 1].split("=")[-1].strip(),
                            final_flines[line_no + i + 2].split("=")[-1].strip(),
                        ],
                    )
                )
            )
        elif "Current cell volume" in line:
            castep["cell_volume"] = f90_float_parse(
                line.split("=")[1].split()[0].strip()
            )
        elif "Cell Contents" in line:
            castep["positions_frac"] = []
            i = 1
            atoms = False
            while True:
                if atoms:
                    if "xxxxxxxxx" in final_flines[line_no + i]:
                        atoms = False
                        break
                    else:
                        temp_frac = final_flines[line_no + i].split()[3:6]
                        castep["positions_frac"].append(
                            list(map(f90_float_parse, temp_frac))
                        )
                if "x------" in final_flines[line_no + i]:
                    atoms = True
                i += 1
        # don't check if final_energy exists, as this will update for each GO step
        elif "Final energy =" in line or "Final energy, E" in line:
            castep["total_energy"] = f90_float_parse(line.split("=")[1].split()[0])
            castep["total_energy_per_atom"] = (
                castep["total_energy"] / castep["num_atoms"]
            )
        elif "Final free energy" in line:
            castep["smeared_free_energy"] = f90_float_parse(
                line.split("=")[1].split()[0]
            )
            castep["smeared_free_energy_per_atom"] = (
                castep["smeared_free_energy"] / castep["num_atoms"]
            )
        elif "0K energy" in line:
            castep["0K_energy"] = f90_float_parse(line.split("=")[1].split()[0])
            castep["0K_energy_per_atom"] = castep["0K_energy"] / castep["num_atoms"]
        elif "(SEDC) Total Energy Correction" in line:
            castep["dispersion_correction_energy"] = f90_float_parse(
                line.split(":")[1].split()[0]
            )
        elif "Dispersion corrected final energy" in line:
            castep["dispersion_corrected_energy"] = f90_float_parse(
                line.split("=")[1].split()[0]
            )
            castep["dispersion_corrected_energy_per_atom"] = (
                castep["dispersion_corrected_energy"] / castep["num_atoms"]
            )
        elif "Dispersion corrected final free energy" in line:
            castep["dispersion_corrected_free_energy"] = f90_float_parse(
                line.split("=")[1].split()[0]
            )
            castep["dispersion_corrected_free_energy_per_atom"] = (
                castep["dispersion_corrected_free_energy"] / castep["num_atoms"]
            )
        elif "Dispersion corrected est. 0K energy" in line:
            castep["dispersion_corrected_0K_energy"] = f90_float_parse(
                line.split("=")[1].split()[0]
            )
            castep["dispersion_corrected_0K_energy_per_atom"] = (
                castep["dispersion_corrected_0K_energy"] / castep["num_atoms"]
            )
        elif " Forces **" in line:
            if "Unconstrained" in line:
                force_key = "forces"
            elif "Constrained" in line:
                force_key = "constrained_forces"
            else:
                force_key = "forces"
            castep[force_key] = []
            i = 1
            forces = False
            while True:
                if forces:
                    if "*" in final_flines[line_no + i].split()[1]:
                        forces = False
                        break
                    else:
                        castep[force_key].append([])
                        for j in range(3):
                            temp = final_flines[line_no + i].replace("(cons'd)", "")
                            castep[force_key][-1].append(
                                f90_float_parse(temp.split()[3 + j])
                            )
                elif "x" in final_flines[line_no + i]:
                    i += 1  # skip next blank line
                    forces = True
                i += 1
            if not force_key == "constrained_forces":
                castep["max_force_on_atom"] = np.max(
                    np.linalg.norm(castep[force_key], axis=-1)
                )
        elif "Stress Tensor" in line:
            if "Constrained" in line:
                stress_key = "constrained_stress"
            else:
                stress_key = "stress"
            i = 1
            while i < 20:
                if "Cartesian components" in final_flines[line_no + i]:
                    castep[stress_key] = []
                    for j in range(3):
                        castep[stress_key].append(
                            list(
                                map(
                                    f90_float_parse,
                                    (final_flines[line_no + i + j + 4].split()[2:5]),
                                )
                            )
                        )
                elif "Pressure" in final_flines[line_no + i]:
                    try:
                        castep["pressure"] = f90_float_parse(
                            final_flines[line_no + i].split()[-2]
                        )
                    except ValueError:
                        pass
                    break
                i += 1
        elif "Integrated Spin Density" in line:
            castep["integrated_spin_density"] = f90_float_parse(line.split()[-2])
        elif "Integrated |Spin Density|" in line:
            castep["integrated_mod_spin_density"] = f90_float_parse(line.split()[-2])
        elif "Atomic Populations (Mulliken)" in line:
            # population format seems to change every CASTEP version...
            if float(castep.get("castep_version", 0.0)) >= 18:
                if castep["spin_polarized"]:
                    castep["mulliken_spins"] = []
                    castep["mulliken_net_spin"] = 0.0
                    castep["mulliken_abs_spin"] = 0.0
                castep["mulliken_charges"] = []
                castep["mulliken_spins"] = []
                i = 0
                ind = 0
                while ind < len(castep["atom_types"]):
                    if castep["spin_polarized"]:
                        castep["mulliken_charges"].append(
                            f90_float_parse(final_flines[line_no + i + 4].split()[-2])
                        )
                        castep["mulliken_spins"].append(
                            f90_float_parse(final_flines[line_no + i + 4].split()[-1])
                        )
                        castep["mulliken_net_spin"] += castep["mulliken_spins"][-1]
                        castep["mulliken_abs_spin"] += abs(castep["mulliken_spins"][-1])
                        i += 2
                    else:
                        castep["mulliken_charges"].append(
                            f90_float_parse(final_flines[line_no + i + 4].split()[-1])
                        )
                        i += 1
                    ind += 1
        elif "Final Enthalpy" in line:
            castep["enthalpy"] = f90_float_parse(line.split("=")[-1].split()[0])
            castep["enthalpy_per_atom"] = (
                f90_float_parse(line.split("=")[-1].split()[0]) / castep["num_atoms"]
            )
        elif "Final bulk modulus" in line:
            try:
                castep["bulk_modulus"] = f90_float_parse(line.split("=")[-1].split()[0])
            except ValueError:
                # the above will fail if bulk modulus was not printed (i.e. if it was unchanged)
                pass

        elif (
            "Chemical Shielding and Electric Field Gradient Tensors".lower()
            in line.lower()
        ):
            i = 5
            castep["chemical_shifts"] = []
            while True:
                # break when the line containing just '=' is reached
                if len(flines[line_no + i].split()) == 1:
                    break
                castep["chemical_shifts"].append(flines[line_no + i].split()[3])
                i += 1
            if len(castep["chemical_shifts"]) != len(castep["atom_types"]):
                raise RuntimeError(
                    "Found fewer chemical shifts than atoms (or vice versa)!"
                )

    # calculate kpoint spacing if not found
    if (
        "kpoints_mp_grid" in castep
        and "kpoints_mp_spacing" not in castep
        and "lattice_cart" in castep
    ):
        castep["kpoints_mp_spacing"] = calc_mp_spacing(
            castep["lattice_cart"], castep["kpoints_mp_grid"], prec=4
        )
    return castep


def _castep_scrape_metadata(flines, castep):
    """Scrape addtitional timing metadata from CASTEP file.

    Parameters:
        flines (list): list of lines contained in file
        castep (dict): dictionary to update with data

    Returns:
        dict: dictionary updated with scraped data

    """
    # computing metadata, i.e. parallelism, time, memory, version
    if "total_time_secs" not in castep:
        castep["total_time_secs"] = 0
    if "total_time_hrs" not in castep:
        castep["total_time_hrs"] = 0

    for ind, line in enumerate(flines):
        if "Total time" in line and "matrix elements" not in line:
            try:
                time = f90_float_parse(line.split()[-2])
                castep["total_time_secs"] += time
                castep["total_time_hrs"] += time / 3600
            except (IndexError, ValueError):
                castep["final_calculation_time_secs"] = 0
        elif "Calculation only took" in line:
            castep["_time_estimated"] = f90_float_parse(line.split()[4])


def _castep_find_final_structure(flines):
    """Search for info on final structure in .castep file.

    Parameters:
        flines (list): list of lines in file.

    Returns:
        int: line number in file where total energy of final structure is printed.

    """
    optimised = False
    finish_line = 0
    success_string = "Geometry optimization completed successfully"
    failure_string = "Geometry optimization failed to converge after"
    annoying_string = "WARNING - there is nothing to optimise - skipping relaxation"
    # look for final "success/failure" string in file for geometry optimisation
    for line_no, line in enumerate(reversed(flines)):
        if success_string in line:
            finish_line = len(flines) - line_no
            optimised = True
            break
        if annoying_string in line:
            finish_line = len(flines) - line_no
            optimised = True
            break
        if failure_string in line:
            finish_line = len(flines) - line_no
            optimised = False
            break

    # now wind back to get final total energies and non-symmetrised forces
    for count, line in enumerate(reversed(flines[:finish_line])):
        if "Final energy, E" in line or "Final energy =" in line:
            finish_line -= count + 2
            break

    return finish_line, optimised


def _castep_finalize_snapshot(snapshot: dict, intermediates: list) -> None:
    """Add per-atom keys to snapshot and append it to the intermediates list.

    Parameters:
        snapshot: The document containing the current snapshot.
        intermediates: The list of snapshots so far.

    """
    # if positions frac or lattice didn't change (and thus weren't printed, use the last value)
    if "positions_frac" not in snapshot:
        snapshot["positions_frac"] = intermediates[-1]["positions_frac"]
        snapshot["atom_types"] = intermediates[-1]["atom_types"]
        snapshot["num_atoms"] = len(snapshot["positions_frac"])
    if "lattice_cart" not in snapshot:
        snapshot["lattice_cart"] = intermediates[-1]["lattice_cart"]
        snapshot["lattice_abc"] = intermediates[-1]["lattice_abc"]

    snapshot["smeared_free_energy_per_atom"] = (
        snapshot["smeared_free_energy"] / snapshot["num_atoms"]
    )
    snapshot["0K_energy_per_atom"] = snapshot["0K_energy"] / snapshot["num_atoms"]
    snapshot["total_energy_per_atom"] = snapshot["total_energy"] / snapshot["num_atoms"]
    # handle single atom forces edge-case
    if snapshot["num_atoms"] == 1:
        snapshot["forces"] = [[0, 0, 0]]

    intermediates.append(snapshot)


def _castep_scrape_all_snapshots(flines, intermediates=False):
    """Scrape all intermediate structures from a CASTEP file, both
    geometry optimisation snapshots, and repeated SCF calculations.

    Parameters:
        flines (list): list of lines of file.
        intermediates (bool): whether to save the snapshots or just
            count them.

    Returns:
        :obj:`list` of :obj:`dict`: list of dictionaries containing
            intermediate snapshots (will be empty if `intermediates`
            is false).
        int: number of completed geometry optimisation steps.

    """
    intermediates_list = []
    num_opt_steps = 0
    snapshot = dict()
    if intermediates:
        for line_no, line in enumerate(flines):
            try:
                if "Cell Contents" in line or "Unit Cell" in line:
                    if "total_energy" in snapshot:
                        _castep_finalize_snapshot(snapshot, intermediates_list)
                        snapshot = dict()

                if "Cell Contents" in line:
                    snapshot["positions_frac"] = []
                    snapshot["atom_types"] = []
                    i = 1
                    atoms = False
                    while True:
                        if atoms:
                            if "xxxxxxxxx" in flines[line_no + i]:
                                atoms = False
                                break
                            else:
                                temp_frac = flines[line_no + i].split()[3:6]
                                snapshot["positions_frac"].append(
                                    list(map(f90_float_parse, temp_frac))
                                )
                                snapshot["atom_types"].append(
                                    flines[line_no + i].split()[1]
                                )
                        if "x------" in flines[line_no + i]:
                            atoms = True
                        i += 1
                    for ind, pos in enumerate(snapshot["positions_frac"]):
                        for k in range(3):
                            if pos[k] > 1 or pos[k] < 0:
                                snapshot["positions_frac"][ind][k] %= 1

                    snapshot["num_atoms"] = len(snapshot["positions_frac"])
                    snapshot["stoichiometry"] = get_stoich(snapshot["atom_types"])

                elif "Real Lattice" in line:
                    snapshot["lattice_cart"] = []
                    i = 1
                    while True:
                        if not flines[line_no + i].strip():
                            break
                        else:
                            temp_line = flines[line_no + i].split()[0:3]
                            snapshot["lattice_cart"].append(
                                list(map(f90_float_parse, temp_line))
                            )
                        i += 1

                elif "Lattice parameters" in line:
                    snapshot["lattice_abc"] = []
                    i = 1
                    snapshot["lattice_abc"].append(
                        list(
                            map(
                                f90_float_parse,
                                [
                                    flines[line_no + i]
                                    .split("=")[1]
                                    .strip()
                                    .split(" ")[0],
                                    flines[line_no + i + 1]
                                    .split("=")[1]
                                    .strip()
                                    .split(" ")[0],
                                    flines[line_no + i + 2]
                                    .split("=")[1]
                                    .strip()
                                    .split(" ")[0],
                                ],
                            )
                        )
                    )
                    snapshot["lattice_abc"].append(
                        list(
                            map(
                                f90_float_parse,
                                [
                                    flines[line_no + i].split("=")[-1].strip(),
                                    flines[line_no + i + 1].split("=")[-1].strip(),
                                    flines[line_no + i + 2].split("=")[-1].strip(),
                                ],
                            )
                        )
                    )

                elif "Current cell volume" in line:
                    snapshot["cell_volume"] = f90_float_parse(
                        line.split("=")[1].split()[0].strip()
                    )
                elif "Final energy, E" in line or "Final energy =" in line:
                    snapshot["total_energy"] = f90_float_parse(
                        line.split("=")[1].split()[0]
                    )
                elif "Final free energy" in line:
                    snapshot["smeared_free_energy"] = f90_float_parse(
                        line.split("=")[1].split()[0]
                    )
                elif "0K energy" in line:
                    snapshot["0K_energy"] = f90_float_parse(
                        line.split("=")[1].split()[0]
                    )
                elif " Forces **" in line:
                    if "Unconstrained" in line:
                        force_key = "forces"
                    elif "Constrained" in line:
                        force_key = "constrained_forces"
                    else:
                        force_key = "forces"
                    snapshot[force_key] = []
                    i = 1
                    max_force = 0
                    forces = False
                    while True:
                        if forces:
                            if "*" in flines[line_no + i].split()[1]:
                                forces = False
                                break
                            else:
                                force_on_atom = 0
                                snapshot[force_key].append([])
                                for j in range(3):
                                    temp = flines[line_no + i].replace("(cons'd)", "")
                                    force_on_atom += (
                                        f90_float_parse(temp.split()[3 + j]) ** 2
                                    )
                                    snapshot[force_key][-1].append(
                                        f90_float_parse(temp.split()[3 + j])
                                    )
                                if force_on_atom > max_force:
                                    max_force = force_on_atom
                        elif "x" in flines[line_no + i]:
                            i += 1  # skip next blank line
                            forces = True
                        i += 1
                    if not force_key == "constrained_forces":
                        snapshot["max_force_on_atom"] = pow(max_force, 0.5)

                elif "Stress Tensor" in line:
                    if "Constrained" in line:
                        stress_key = "constrained_stress"
                    else:
                        stress_key = "stress"
                    i = 1
                    while i < 20:
                        if "Cartesian components" in flines[line_no + i]:
                            snapshot[stress_key] = []
                            for j in range(3):
                                snapshot[stress_key].append(
                                    list(
                                        map(
                                            f90_float_parse,
                                            (flines[line_no + i + j + 4].split()[2:5]),
                                        )
                                    )
                                )
                        elif "Pressure" in flines[line_no + i]:
                            snapshot["pressure"] = f90_float_parse(
                                flines[line_no + i].split()[-2]
                            )
                            break
                        i += 1
                # use only finished iterations for counting number of complete GO steps
                elif ": finished iteration" in line and "with enthalpy" in line:
                    # don't include the "zeroth" step before anything has been moved
                    if "0" not in line.split():
                        num_opt_steps += 1
            except Exception as exc:
                raise RuntimeError from exc
    else:
        for line_no, line in enumerate(flines):
            # use only finished iterations for counting number of complete GO steps
            if ": finished iteration" in line and "with enthalpy" in line:
                # don't include the "zeroth" step before anything has been moved
                if "0" not in line.split():
                    num_opt_steps += 1

    return intermediates_list, num_opt_steps


def _castep_scrape_beef(flines, castep):
    """Scrape the Bayesian error estimate output from CASTEP, storing
    it under the `_beef` key.

    Parameters:
        flines (list): CASTEP output flines to scrape.
        castep (dict): dictionary to update with BEEF output.

    """
    if castep.get("_xc_beef"):
        from matador.utils.chem_utils import HARTREE_TO_EV

        for line_no, line in enumerate(flines):
            if "Bayesian Error Estimate (BEE)" in line.strip():
                beef_start = line_no
                break
        else:
            return
    else:
        return

    castep["_beef"] = {
        "thetas": [],
        "xc_energy": [],
        "total_energy": [],
        "total_energy_per_atom": [],
        "xc_energy_per_atom": [],
    }

    for line_no, line in enumerate(flines[beef_start:]):
        if "<-- BEEF" in line:
            castep["_beef"]["thetas"].append(
                [f90_float_parse(val) for val in line.strip().split()[1:4]]
            )
            castep["_beef"]["xc_energy"].append(
                HARTREE_TO_EV * f90_float_parse(line.strip().split()[4])
            )
            castep["_beef"]["total_energy"].append(
                HARTREE_TO_EV * f90_float_parse(line.strip().split()[6])
            )
            castep["_beef"]["xc_energy_per_atom"].append(
                castep["_beef"]["xc_energy"][-1] / castep["num_atoms"]
            )
            castep["_beef"]["total_energy_per_atom"].append(
                castep["_beef"]["total_energy"][-1] / castep["num_atoms"]
            )
        if "BEEF completed" in line:
            break
    else:
        raise RuntimeError("End of BEEF estimate not found.")

    castep["_beef"]["mean_total_energy"] = np.mean(castep["_beef"]["total_energy"])
    castep["_beef"]["mean_total_energy_per_atom"] = (
        np.mean(castep["_beef"]["total_energy"]) / castep["num_atoms"]
    )
    castep["_beef"]["std_dev_total_energy"] = np.std(castep["_beef"]["total_energy"])
    castep["_beef"]["std_dev_total_energy_per_atom"] = (
        np.std(castep["_beef"]["total_energy"]) / castep["num_atoms"]
    )


def _castep_scrape_devel_code(flines, castep):
    """Scrape the contents of the developer code block and
    extract any information about nanotube encapsulation.
    Searches for then scrapes the last occurence of "Developer
    Code". If nanotube information is found, the `encapsulated`
    flag is set to True.

    Parameters:
        flines (list): CASTEP output flines to scrape.
        castep (dict): dictionary to update with BEEF output.

    """
    last_devel_code = len(flines)
    for line_no, line in enumerate(flines):
        if line.startswith(" *") and "**** Developer Code ****" in line:
            last_devel_code = line_no

    castep["devel_code"] = []
    for line_no, line in enumerate(flines[last_devel_code + 1 :]):
        # look for end of block
        if "*************" in line.strip():
            break

        line = line.strip()
        if line:
            castep["devel_code"].append(line)

    devel_code = "\n".join(castep["devel_code"])
    if "gaussian_cylinder" in devel_code.lower():
        castep["encapsulated"] = True
        for line in castep["devel_code"]:
            if "radius" in line.lower():
                castep["cnt_radius"] = float(line.strip().split()[-1])

    # store as string rather than list
    castep["devel_code"] = devel_code


def _optados_get_projector_labels(header):
    """Get OptaDOS projector labels from a pdos.dat or pdis.dat file,
    returning None-padded labels as `('species', 'ang_mom', 'spin channel')`,
    e.g. ('Li', 's', None) or ('Cr', None, 'up').

    Parameters:
        header (str): the file header containing projector labels.

    Returns:
        list(tuple(str, str, str)): projector labels formatted as above.

    """
    projectors = []
    for ind, line in enumerate(header):
        separators = ["Projector:", "Column:"]
        if any(sep in line for sep in separators):
            is_spin_pdos = "Spin" in header[ind + 1]
            # skip current line and column headings
            j = 2
            elements = []
            ang_mom_channels = []
            spin_channels = []
            # loop over file finding projector header blocks
            while ind + j + 1 < len(header) and not any(
                keyword in header[ind + j + 1] for keyword in separators
            ):
                elements.append(header[ind + j].split()[1])
                ang_mom_channels.append(header[ind + j].split()[3])
                if is_spin_pdos:
                    spin_channels.append(header[ind + j].split()[4].lower())
                j += 1

            projector_label = []

            # check that this projector contains exactly 1 species
            if len(set(elements)) == 1:
                projector_label.append(elements[0])
            else:
                projector_label.append(None)

            # check that this projector has exactly 1 ang mom channel
            if len(set(ang_mom_channels)) == 1:
                projector_label.append(ang_mom_channels[0])
            else:
                projector_label.append(None)

            # check that this projector has exactly 1 spin channel
            if len(set(spin_channels)) == 1:
                projector_label.append(spin_channels[0])
            else:
                projector_label.append(None)

            projector_label = tuple(projector_label)
            projectors.append(projector_label)

    return projectors


[docs]def get_seed_metadata(doc, seed):
    """For a given document and seedname, look for
    ICSD CollCode, MaterialsProject IDs and DOIs to add
    to the document.

    Parameters:
        doc (dict): the input document.
        seed (str): the filename that is being scraped (sans file extension).

    """
    errors = []
    if "-CollCode-" in seed:
        try:
            doc["icsd"] = int(
                seed.split("CollCode-")[-1].split("-")[0].split("_")[0].split(".")[0]
            )
        except ValueError as exc:
            errors.append(exc)
    elif "CollCode" in seed:
        try:
            doc["icsd"] = int(
                seed.split("CollCode")[-1].split("-")[0].split("_")[0].split(".")[0]
            )
        except ValueError as exc:
            errors.append(exc)
    elif "-ICSD-" in seed:
        try:
            doc["icsd"] = int(
                seed.split("-ICSD-")[-1].split("-")[0].split("_")[0].split(".")[0]
            )
        except ValueError as exc:
            errors.append(exc)
    if "-OQMD-" in seed:
        try:
            doc["oqmd_id"] = int(
                seed.split("-OQMD-")[-1].split("-")[0].split("_")[0].split(".")[0]
            )
        except ValueError as exc:
            errors.append(exc)
    elif "-OQMD_" in seed:
        try:
            doc["oqmd_id"] = int(
                seed.split("-OQMD_")[-1].split("-")[0].split("_")[0].split(".")[0]
            )
        except ValueError as exc:
            errors.append(exc)
    if "-MP-" in seed:
        try:
            doc["mp_id"] = int(
                seed.split("-MP-")[-1].split("-")[0].split("_")[0].split(".")[0]
            )
        except ValueError as exc:
            errors.append(exc)
    elif "-MP_" in seed:
        try:
            doc["mp_id"] = int(
                seed.split("-MP_")[-1].split("-")[0].split("_")[0].split(".")[0]
            )
        except ValueError as exc:
            errors.append(exc)
    if "-DOI-" in seed:
        try:
            doc["doi"] = seed.split("-DOI-")[-1].split("-")[0].replace("__", "/")
        except ValueError as exc:
            errors.append(exc)

    if errors:
        raise ValueError(errors)