import importlib
import logging
import warnings
from functools import lru_cache
from pathlib import Path
from typing import Callable, Dict, List, Optional, Tuple
from metatensor.torch import TensorMap
from metatomic.torch import System
from omegaconf import DictConfig
from ..target_info import TargetInfo
# Reader backends shipped with metatrain; each name corresponds to a module
# under `metatrain.utils.data.readers` (see `_load_reader_module`).
AVAILABLE_READERS = ["ase", "metatensor"]
""":py:class:`list`: list containing all implemented reader libraries"""

# File suffixes for which a reader can be picked automatically when the
# config does not set one explicitly.
DEFAULT_READER = {
    ".xyz": "ase",
    ".extxyz": "ase",
    ".mts": "metatensor",
}
""":py:class:`dict`: mapping file extensions to a default reader"""

logger = logging.getLogger(__name__)
def read_systems(
    filename: str,
    reader: Optional[str] = None,
) -> List[System]:
    """Read system information from a file.

    :param filename: name of the file to read
    :param reader: reader library for parsing the file. If :py:obj:`None` the
        library is determined from the file extension.
    :returns: list of systems, stored in double precision
    :raises ValueError: if no reader is found or the data is not in double
        precision
    """
    # Determine the reader library from the file extension when not provided.
    if reader is None:
        file_suffix = Path(filename).suffix
        try:
            reader = DEFAULT_READER[file_suffix]
        except KeyError as e:
            raise ValueError(
                f"File extension {file_suffix!r} is not linked to a default reader "
                "library. You can try reading it by setting a specific 'reader' from "
                f"the known ones: {', '.join(AVAILABLE_READERS)} "
            ) from e

    module = _load_reader_module(reader)

    # Fetch and call the reader library's `read_systems` entry point.
    try:
        reader_fn = module.read_systems
    except AttributeError as e:
        raise ValueError(
            f"Reader library {reader!r} cannot read systems. "
            f"You can try with other readers: {AVAILABLE_READERS}"
        ) from e

    systems = reader_fn(filename)

    # elements in data are `torch.ScriptObject`s and their `dtype` is an integer.
    # A C++ double/torch.float64 is `7` according to
    # https://github.com/pytorch/pytorch/blob/207564bab1c4fe42750931765734ee604032fb69/c10/core/ScalarType.h#L54-L93
    if not all(s.dtype == 7 for s in systems):
        raise ValueError("The loaded systems are not in double precision.")

    return systems
def read_targets(
    conf: DictConfig,
) -> Tuple[Dict[str, List[TensorMap]], Dict[str, TargetInfo]]:
    """Read every target section from a fully expanded config.

    Such a config can be obtained with :func:`expand_dataset_config
    <metatrain.utils.omegaconf.expand_dataset_config>`. All targets are stored
    in double precision.

    Parsing is delegated to subfunctions like :func:`read_energy` for the
    requested target quantity. Currently only `energy` is a supported target
    property, but within the `energy` section gradients such as `forces`, the
    `stress` or the `virial` can be added. Other gradients are silently
    ignored.

    :param conf: config containing the keys for what should be read.
    :returns: Dictionary containing a list of TensorMaps for each target
        section in the config as well as a ``Dict[str, TargetInfo]`` object
        containing the metadata of the targets.
    :raises ValueError: if the target name is not valid. Valid target names
        are those that either start with ``mtt::`` or those that are in the
        list of standard outputs of ``metatomic`` (see
        https://docs.metatensor.org/metatomic/latest/outputs/)
    """
    # Targets get name validation and the energy-vs-generic reader dispatch.
    target_data, target_info = _read_conf_section(
        conf,
        decide_reader=_decide_target_reader,
        validate_key=_validate_target,
    )
    return target_data, target_info
def read_extra_data(
    conf: DictConfig,
) -> Tuple[Dict[str, List[TensorMap]], Dict[str, TargetInfo]]:
    """Read extra data from a fully expanded config.

    Works like :func:`read_targets`, but for auxiliary data that is not part
    of the main targets and might be useful for training or evaluation.
    Section names are not validated and every entry is parsed with the
    generic reader.

    :param conf: config containing the keys for what should be read.
    :returns: Dictionary containing a list of TensorMaps for each extra data
        section in the config as well as a ``Dict[str, TargetInfo]`` object
        containing the metadata of the extra data.
    """
    extra_data, extra_info = _read_conf_section(
        conf,
        decide_reader=_decide_generic_reader,
        validate_key=_no_validate,
    )
    return extra_data, extra_info
def _resolve_reader_name(entry: DictConfig) -> str:
    """Return the reader library name for one config entry.

    Uses the explicit ``reader`` key when present; otherwise infers the
    reader from the file suffix of ``read_from`` via ``DEFAULT_READER``.

    :param entry: config entry with optional ``reader`` and ``read_from`` keys
    :returns: name of the reader library
    :raises ValueError: if the file extension has no default reader
    """
    reader = entry.get("reader")
    if reader is not None:
        return reader
    suffix = Path(entry.get("read_from")).suffix
    try:
        return DEFAULT_READER[suffix]
    except KeyError as e:
        raise ValueError(
            f"File extension {suffix!r} has no default reader. "
            f"Set 'reader' explicitly from: {AVAILABLE_READERS}"
        ) from e


def _read_conf_section(
    conf: DictConfig,
    decide_reader: Callable[[str, DictConfig], str],
    validate_key: Callable[[str, DictConfig], None],
) -> Tuple[Dict[str, List[TensorMap]], Dict[str, TargetInfo]]:
    """
    Generic loader for any DictConfig section (targets, extra_data, …).

    :param conf: mapping of section names to entry configs
    :param decide_reader: callback(key, entry) -> either "energy" or "generic"
    :param validate_key: callback(key, entry) -> None (may raise or log)
    :returns: (data_dict, info_dict)
    :raises ValueError: on unsupported file types, readers, or dtype mismatch
    """
    data_dict: Dict[str, List[TensorMap]] = {}
    info_dict: Dict[str, TargetInfo] = {}

    for key, entry in conf.items():
        # section-specific key validation (may raise or emit warnings)
        validate_key(key, entry)

        # decide which reader method to call ("energy" or "generic")
        reader_kind = decide_reader(key, entry)

        # resolve reader name (explicit or default via file suffix)
        reader = _resolve_reader_name(entry)
        module = _load_reader_module(reader)

        # fetch the appropriate read_* function from the reader module
        method_name = f"read_{reader_kind}"
        try:
            reader_fn = getattr(module, method_name)
        except AttributeError as e:
            available = [m for m in dir(module) if m.startswith("read_")]
            raise ValueError(
                f"Reader {reader!r} has no method {method_name!r}. "
                f"Available methods: {available}"
            ) from e

        # execute reader and collect outputs
        tensormaps, info = reader_fn(entry)

        # enforce double precision: the `dtype` of these `torch.ScriptObject`s
        # is an integer, and torch.float64 corresponds to 7
        if not all(t.dtype == 7 for t in tensormaps):
            raise ValueError(f"Data for '{key}' not in double precision (dtype==7).")

        data_dict[key] = tensormaps
        info_dict[key] = info

    return data_dict, info_dict
# Callbacks for targets

# Standard metatomic output names that are valid top-level targets. NOTE:
# despite the `_list` suffix this is a set; it is used for membership tests
# below and interpolated into the error message.
_standard_outputs_list = {
    "energy",
    "non_conservative_forces",
    "non_conservative_stress",
}


def _validate_target(key: str, entry: DictConfig) -> None:
    """Check that a top-level target name is acceptable.

    Valid names are the standard outputs in ``_standard_outputs_list`` or any
    name starting with ``mtt::``. Gradient-like names (containing "force",
    "virial" or "stress") only trigger a warning, since they are usually
    meant to be gradient sub-sections of an energy target.

    :param key: name of the target section
    :param entry: the target's config entry; unused, present only to match
        the ``validate_key`` callback signature of ``_read_conf_section``
    :raises ValueError: if ``key`` is neither a standard output nor an
        ``mtt::``-prefixed custom name
    """
    if key in _standard_outputs_list:
        # Standard outputs (including non_conservative_forces/stress) are
        # valid top-level targets as-is; skip the gradient-name heuristics
        # below so they do not trigger spurious warnings.
        return

    if not key.startswith("mtt::"):
        if key.lower() in {"force", "forces", "virial", "stress"}:
            warnings.warn(
                f"{key!r} should not be its own top-level target, "
                "but rather a sub-section of the 'energy' target",
                stacklevel=2,
            )
            # Already warned with the specific message above; the generic
            # gradient-name warning below would be redundant.
            return
        raise ValueError(
            f"Target name ({key}) must either be one of "
            f"{_standard_outputs_list} or start with `mtt::`."
        )

    if any(name in key.lower() for name in ("force", "virial", "stress")):
        warnings.warn(
            f"the name of {key!r} resembles to a gradient of "
            "energies; it should probably not be its own top-level target, "
            "but rather a gradient sub-section of a target with the "
            "`energy` quantity",
            stacklevel=2,
        )
def _decide_target_reader(key: str, entry: DictConfig) -> str:
    """Pick the reader kind for a target entry.

    ``"energy"`` is returned only for entries whose quantity is ``energy``,
    that are not per-atom, have a single sub-target and are of scalar type;
    every other entry goes through the generic reader.
    """
    if entry.get("quantity") != "energy":
        return "generic"
    if entry.get("per_atom", False):
        return "generic"
    if entry.get("num_subtargets", 1) != 1:
        return "generic"
    if entry.get("type") != "scalar":
        return "generic"
    return "energy"
# Callbacks for "extra_data"
def _no_validate(key: str, entry: DictConfig) -> None:
    """No-op validator: extra data sections accept any key name."""
    return None


def _decide_generic_reader(key: str, entry: DictConfig) -> str:
    """Extra data is always parsed with the generic reader."""
    return "generic"
@lru_cache(maxsize=None)
def _load_reader_module(reader_name: str):
    """Import (and memoize) the reader module called ``reader_name``.

    Reader modules live under ``metatrain.utils.data.readers``. A failed
    import is reported as a :class:`ValueError` listing the supported
    readers.
    """
    try:
        return importlib.import_module(
            f"metatrain.utils.data.readers.{reader_name}"
        )
    except ImportError as e:
        raise ValueError(
            f"Reader library {reader_name!r} not supported. Choose from "
            f"{', '.join(AVAILABLE_READERS)}"
        ) from e