diff --git a/.gitignore b/.gitignore index b875f77145..d2a6816f9f 100644 --- a/.gitignore +++ b/.gitignore @@ -72,13 +72,17 @@ build/* *.log *.xml +# Test fixtures: real ESS log files / XML are tracked under arc/testing/ +!arc/testing/**/*.log +!arc/testing/**/*.xml -# AI Agent files +# AI Agent files and folders AGENTS.md spec.md +.vexb/* # Other AI things .agents ARC.egg* uv* -*graphify* \ No newline at end of file +*graphify* diff --git a/ARC.py b/ARC.py index 0707ec2130..bba6da4c2c 100644 --- a/ARC.py +++ b/ARC.py @@ -11,6 +11,8 @@ from arc.common import read_yaml_file from arc.main import ARC +from arc.tckdb.config import TCKDBConfig +from arc.tckdb.sweep import run_upload_sweep def parse_command_line_arguments(command_line_args=None): @@ -59,8 +61,32 @@ def main(): input_dict['verbose'] = input_dict['verbose'] if 'verbose' in input_dict else verbose if 'project_directory' not in input_dict or not input_dict['project_directory']: input_dict['project_directory'] = project_directory + + tckdb_config = TCKDBConfig.from_dict(input_dict.pop('tckdb', None)) + arc_object = ARC(**input_dict) - arc_object.execute() + arc_object.tckdb_config = tckdb_config + if tckdb_config is not None: + print(f'TCKDB integration enabled: {tckdb_config.base_url}') + + # Persistent SSH pool lives for the duration of the run; close it + # explicitly on every exit path (success, error, ctrl-C) so we don't + # leave paramiko Transports orphaned. Lazily instantiated on first + # remote-queue job, so this is a no-op for fully-local runs. + try: + arc_object.execute() + + if tckdb_config is not None: + from arc.tckdb.adapter import TCKDBAdapter + adapter = TCKDBAdapter(tckdb_config, project_directory=arc_object.project_directory) + run_upload_sweep( + adapter=adapter, + project_directory=arc_object.project_directory, + tckdb_config=tckdb_config, + ) + finally: + from arc.job.ssh_pool import reset_default_pool + reset_default_pool() if __name__ == '__main__': diff --git a/ARC_test.py b/ARC_test.py new file mode 100644 index 0000000000..bad07b5825 --- /dev/null +++ b/ARC_test.py @@ -0,0 +1,354 @@ +"""Tests for the ARC.py end-of-run TCKDB upload sweep dispatcher. + +These tests focus on the wiring between ``tckdb.upload_mode`` and the +adapter method that gets called per species. They use a stub adapter so +no network or live ARC objects are required. +""" + +import os +import shutil +import tempfile +import unittest +from pathlib import Path +from types import SimpleNamespace + +import yaml + +from arc.tckdb.adapter import UploadOutcome +from arc.tckdb.config import TCKDBConfig +from arc.tckdb.sweep import _resolve_artifact_path, run_upload_sweep + + +# -------------------------------------------------------------------------- +# Test doubles +# -------------------------------------------------------------------------- + + +class _StubAdapter: + """Records which adapter method was called per species, no network.""" + + def __init__(self, *, conformer_outcome=None, bundle_outcome=None, + conformer_raises=None, bundle_raises=None): + self.conformer_calls = [] + self.bundle_calls = [] + self.artifact_calls = [] + self._conformer_outcome = conformer_outcome + self._bundle_outcome = bundle_outcome + self._conformer_raises = conformer_raises + self._bundle_raises = bundle_raises + + def submit_from_output(self, *, output_doc, species_record): + self.conformer_calls.append(species_record.get("label")) + if self._conformer_raises is not None: + raise self._conformer_raises + return self._conformer_outcome + + def submit_computed_species_from_output(self, *, output_doc, species_record): + self.bundle_calls.append(species_record.get("label")) + if self._bundle_raises is not None: + raise self._bundle_raises + return self._bundle_outcome + + def submit_artifacts_for_calculation(self, **kwargs): + self.artifact_calls.append(kwargs) + return None + + +def _outcome(status, *, label="ethanol", error=None, + primary=None, additional=None): + """Build a stand-in UploadOutcome with the fields the sweep reads.""" + return UploadOutcome( + status=status, + payload_path=Path(f"/tmp/{label}.payload.json"), + sidecar_path=Path(f"/tmp/{label}.meta.json"), + idempotency_key=f"arc:test:{label}:k:abc1234567890def", + error=error, + primary_calculation=primary, + additional_calculations=additional or [], + ) + + +# -------------------------------------------------------------------------- +# Fixtures +# -------------------------------------------------------------------------- + + +def _write_output_yml(project_dir: str, *, species_labels=("CCO",), with_ts=False): + """Write a minimal ``output.yml`` matching what the sweep reads.""" + out_dir = os.path.join(project_dir, "output") + os.makedirs(out_dir, exist_ok=True) + doc = { + "schema_version": "1.0", + "project": "test_project", + "arc_version": "0.0.0", + "opt_level": {"method": "wb97xd", "basis": "def2-tzvp", "software": "gaussian"}, + "species": [ + { + "label": label, + "smiles": "CCO", + "charge": 0, + "multiplicity": 1, + "is_ts": False, + "converged": True, + "xyz": "C 0.0 0.0 0.0\nH 1.0 0.0 0.0", + "opt_n_steps": 12, + "opt_final_energy_hartree": -154.0, + "ess_versions": {"opt": "Gaussian 16, Revision A.03"}, + } + for label in species_labels + ], + "transition_states": [ + {"label": "TS0", "is_ts": True, "converged": True} + ] if with_ts else [], + } + with open(os.path.join(out_dir, "output.yml"), "w") as f: + yaml.safe_dump(doc, f) + return doc + + +# -------------------------------------------------------------------------- +# Dispatch behavior +# -------------------------------------------------------------------------- + + +class TestRunTckdbUploadSweepDispatch(unittest.TestCase): + """Wiring tests: which adapter method gets called per upload_mode.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-sweep-test-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + _write_output_yml(self.tmp) + self.arc_object = SimpleNamespace(project_directory=self.tmp) + + def _cfg(self, **overrides): + defaults = dict( + enabled=True, + base_url="http://localhost:8000/api/v1", + api_key_env="X_TCKDB_API_KEY", + ) + defaults.update(overrides) + return TCKDBConfig(**defaults) + + # ---------------- 1: missing upload_mode → conformer (default) + def test_default_mode_uses_legacy_conformer_path(self): + cfg = self._cfg() # upload_mode defaults to "conformer" + adapter = _StubAdapter(conformer_outcome=_outcome("uploaded")) + run_upload_sweep(adapter=adapter, project_directory=self.arc_object.project_directory, tckdb_config=cfg) + self.assertEqual(adapter.conformer_calls, ["CCO"]) + self.assertEqual(adapter.bundle_calls, []) + + # ---------------- 2: explicit conformer + def test_explicit_conformer_mode_uses_legacy_path(self): + cfg = self._cfg(upload_mode="conformer") + adapter = _StubAdapter(conformer_outcome=_outcome("uploaded")) + run_upload_sweep(adapter=adapter, project_directory=self.arc_object.project_directory, tckdb_config=cfg) + self.assertEqual(adapter.conformer_calls, ["CCO"]) + self.assertEqual(adapter.bundle_calls, []) + + # ---------------- 3: computed_species → bundle path + def test_computed_species_mode_dispatches_bundle(self): + cfg = self._cfg(upload_mode="computed_species") + adapter = _StubAdapter(bundle_outcome=_outcome("uploaded")) + run_upload_sweep(adapter=adapter, project_directory=self.arc_object.project_directory, tckdb_config=cfg) + self.assertEqual(adapter.bundle_calls, ["CCO"]) + self.assertEqual(adapter.conformer_calls, []) + + # ---------------- 4: bundle path never calls legacy + def test_computed_species_does_not_call_legacy_submit(self): + # Multiple species so we'd notice any leak across iterations. + _write_output_yml(self.tmp, species_labels=("CCO", "CO", "CC")) + cfg = self._cfg(upload_mode="computed_species") + adapter = _StubAdapter(bundle_outcome=_outcome("uploaded")) + run_upload_sweep(adapter=adapter, project_directory=self.arc_object.project_directory, tckdb_config=cfg) + self.assertEqual(adapter.bundle_calls, ["CCO", "CO", "CC"]) + self.assertEqual(adapter.conformer_calls, []) + # And no per-artifact sweep call: bundles inline artifacts. + self.assertEqual(adapter.artifact_calls, []) + + # ---------------- 5: failure in bundle mode is recorded; sweep continues + def test_computed_species_failure_continues_to_next_species(self): + _write_output_yml(self.tmp, species_labels=("CCO", "CO")) + cfg = self._cfg(upload_mode="computed_species") + # First species: outcome with status=failed (non-strict path). + # Second species: outcome with status=uploaded. + # We achieve "different per call" by mutating the stub's outcome + # mid-sweep, since the stub returns the same outcome each call by + # default. Use a side-effect via a wrapper instead. + outcomes = iter([ + _outcome("failed", label="CCO", error="HTTP 503"), + _outcome("uploaded", label="CO"), + ]) + adapter = _StubAdapter() + adapter.submit_computed_species_from_output = ( + lambda *, output_doc, species_record: ( + adapter.bundle_calls.append(species_record.get("label")) + or next(outcomes) + ) + ) + run_upload_sweep(adapter=adapter, project_directory=self.arc_object.project_directory, tckdb_config=cfg) + # Both species processed; first failed, second uploaded. + self.assertEqual(adapter.bundle_calls, ["CCO", "CO"]) + + # ---------------- 5b: an unhandled exception in bundle mode is caught + def test_computed_species_exception_is_caught_and_logged(self): + _write_output_yml(self.tmp, species_labels=("CCO", "CO")) + cfg = self._cfg(upload_mode="computed_species") + # Simulate an unhandled exception on the FIRST species; second + # should still be attempted (matches conformer-mode behavior). + call_log = [] + def fake_submit(*, output_doc, species_record): + label = species_record.get("label") + call_log.append(label) + if label == "CCO": + raise RuntimeError("boom") + return _outcome("uploaded", label=label) + adapter = _StubAdapter() + adapter.submit_computed_species_from_output = fake_submit + run_upload_sweep(adapter=adapter, project_directory=self.arc_object.project_directory, tckdb_config=cfg) + self.assertEqual(call_log, ["CCO", "CO"]) + + # ---------------- 6: sidecar written before live upload failure (bundle) + def test_bundle_mode_sidecar_written_before_upload_failure(self): + # This is fundamentally an adapter-level guarantee, but we verify + # the wiring preserves it: a "failed" outcome carrying real + # payload_path and sidecar_path values means the sweep still + # passes those upward to the user. + cfg = self._cfg(upload_mode="computed_species") + sentinel_payload = Path("/tmp/sentinel.payload.json") + sentinel_sidecar = Path("/tmp/sentinel.meta.json") + outcome = UploadOutcome( + status="failed", + payload_path=sentinel_payload, + sidecar_path=sentinel_sidecar, + idempotency_key="arc:t:CCO:c:abc1234567890def", + error="HTTP 503", + ) + adapter = _StubAdapter(bundle_outcome=outcome) + # Capture stdout to confirm the failure summary is printed + # (don't assert on exact text — assert on key tokens). + with unittest.mock.patch("builtins.print") as mock_print: + run_upload_sweep(adapter=adapter, project_directory=self.arc_object.project_directory, tckdb_config=cfg) + printed = "\n".join(str(c.args[0]) for c in mock_print.call_args_list) + self.assertIn("computed-species bundle", printed) + self.assertIn("failed: 1", printed) + self.assertIn("HTTP 503", printed) + + +# -------------------------------------------------------------------------- +# Summary-print mode awareness +# -------------------------------------------------------------------------- + + +class TestSweepSummaryByMode(unittest.TestCase): + """The summary line names the mode; bundle mode omits the artifact line.""" + + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="arc-sweep-summary-") + self.addCleanup(shutil.rmtree, self.tmp, ignore_errors=True) + _write_output_yml(self.tmp) + self.arc_object = SimpleNamespace(project_directory=self.tmp) + + def _run_with_mode(self, *, upload_mode, artifacts_upload=False): + from arc.tckdb.config import TCKDBArtifactConfig + cfg = TCKDBConfig( + enabled=True, base_url="http://x", api_key_env="X", + upload_mode=upload_mode, + artifacts=TCKDBArtifactConfig(upload=artifacts_upload), + ) + adapter = _StubAdapter( + conformer_outcome=_outcome("uploaded"), + bundle_outcome=_outcome("uploaded"), + ) + with unittest.mock.patch("builtins.print") as mock_print: + run_upload_sweep(adapter=adapter, project_directory=self.arc_object.project_directory, tckdb_config=cfg) + return "\n".join(str(c.args[0]) for c in mock_print.call_args_list) + + def test_conformer_mode_summary_says_conformer(self): + out = self._run_with_mode(upload_mode="conformer") + self.assertIn("conformer/calculation", out) + self.assertNotIn("computed-species bundle", out) + + def test_bundle_mode_summary_says_bundle(self): + out = self._run_with_mode(upload_mode="computed_species") + self.assertIn("computed-species bundle", out) + self.assertNotIn("conformer/calculation", out) + + def test_bundle_mode_omits_artifact_line_even_when_enabled(self): + # Inline artifacts mean the standalone artifact tally would mislead. + out = self._run_with_mode(upload_mode="computed_species", artifacts_upload=True) + self.assertNotIn("artifacts: uploaded", out) + + def test_conformer_mode_emits_artifact_line_when_enabled(self): + out = self._run_with_mode(upload_mode="conformer", artifacts_upload=True) + self.assertIn("artifacts:", out) + + +# -------------------------------------------------------------------------- +# _resolve_artifact_path: prefer recorded _input over derivation +# -------------------------------------------------------------------------- + + +class TestResolveArtifactPath(unittest.TestCase): + """The legacy artifact sweep prefers ``output.yml``'s ``_input`` + field, falling back to settings-based derivation only when absent.""" + + def test_input_kind_prefers_recorded_field(self): + """When ``opt_input`` is on the record, it wins over the derived path.""" + species_record = { + "opt_log": "calcs/CH4/opt/input.log", + "opt_input": "calcs/CH4/opt/explicit_input.gjf", # NEW field + } + output_doc = {"opt_level": {"software": "gaussian"}} + path = _resolve_artifact_path( + kind="input", calc_type="opt", + species_record=species_record, output_doc=output_doc, + ) + self.assertEqual(path, "calcs/CH4/opt/explicit_input.gjf") + + def test_input_kind_falls_back_to_settings_when_field_absent(self): + """Older output.yml without ``_input`` still resolves via settings.""" + species_record = {"opt_log": "/abs/calcs/CH4/opt/input.log"} + output_doc = {"opt_level": {"software": "gaussian"}} + path = _resolve_artifact_path( + kind="input", calc_type="opt", + species_record=species_record, output_doc=output_doc, + ) + # Derived sibling: input.gjf next to the log. + self.assertEqual(path, "/abs/calcs/CH4/opt/input.gjf") + + def test_input_kind_falls_back_when_recorded_field_is_none(self): + """Explicit ``None`` in the record (deck wasn't kept) → fallback.""" + species_record = { + "opt_log": "/abs/calcs/CH4/opt/input.log", + "opt_input": None, + } + output_doc = {"opt_level": {"software": "gaussian"}} + path = _resolve_artifact_path( + kind="input", calc_type="opt", + species_record=species_record, output_doc=output_doc, + ) + self.assertEqual(path, "/abs/calcs/CH4/opt/input.gjf") + + def test_input_kind_per_job_picks_correct_recorded_field(self): + """Different calcs hit different ``_input`` fields, not all opt's.""" + species_record = { + "opt_log": "/abs/opt.log", "opt_input": "/abs/opt_deck.gjf", + "freq_log": "/abs/freq.log", "freq_input": "/abs/freq_deck.gjf", + "sp_log": "/abs/sp.log", "sp_input": "/abs/sp_deck.in", # cross-software run + } + output_doc = {"opt_level": {"software": "gaussian"}} + for calc, expected in ( + ("opt", "/abs/opt_deck.gjf"), + ("freq", "/abs/freq_deck.gjf"), + ("sp", "/abs/sp_deck.in"), # NOT input.gjf — sp uses its own software + ): + path = _resolve_artifact_path( + kind="input", calc_type=calc, + species_record=species_record, output_doc=output_doc, + ) + self.assertEqual(path, expected, + msg=f"{calc}: expected {expected}, got {path}") + + +if __name__ == "__main__": + unittest.main() diff --git a/Makefile b/Makefile index a5117b454e..8a95d595bb 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,7 @@ help: @echo " install-kinbot Install KinBot" @echo " install-sella Install Sella" @echo " install-xtb Install xTB" + @echo " install-crest Install CREST" @echo " install-torchani Install TorchANI" @echo " install-ob Install OpenBabel" @echo "" @@ -96,6 +97,9 @@ install-sella: install-xtb: bash $(DEVTOOLS_DIR)/install_xtb.sh +install-crest: + bash $(DEVTOOLS_DIR)/install_crest.sh + install-torchani: bash $(DEVTOOLS_DIR)/install_torchani.sh diff --git a/arc/constants.pxd b/arc/constants.pxd index 67b3f412bb..9fc3b9127d 100644 --- a/arc/constants.pxd +++ b/arc/constants.pxd @@ -1 +1 @@ -cdef double pi, Na, kB, R, h, hbar, c, e, m_e, m_p, m_n, amu, a0, bohr_to_angstrom, E_h, F, E_h_kJmol +cdef double pi, Na, kB, R, h, hbar, c, e, m_e, m_p, m_n, amu, a0, E_h, F, E_h_kJmol, bohr_to_angstrom, angstrom_to_bohr diff --git a/arc/constants.py b/arc/constants.py index 771c09a05d..9ee8a4ac8b 100644 --- a/arc/constants.py +++ b/arc/constants.py @@ -79,6 +79,9 @@ #: Vacuum permittivity epsilon_0 = 8.8541878128 +bohr_to_angstrom = 0.529177 +angstrom_to_bohr = 1 / bohr_to_angstrom + # Cython does not automatically place module-level variables into the module # symbol table when in compiled mode, so we must do this manually so that we # can use the constants from both Cython and regular Python code @@ -101,4 +104,5 @@ 'F': F, 'epsilon_0': epsilon_0, 'bohr_to_angstrom': bohr_to_angstrom, + 'angstrom_to_bohr': angstrom_to_bohr, }) diff --git a/arc/imports.py b/arc/imports.py index ba385c3770..ff4be11939 100644 --- a/arc/imports.py +++ b/arc/imports.py @@ -10,13 +10,30 @@ from arc.settings.submit import incore_commands, pipe_submit, submit_scripts +def _local_overlays_disabled() -> bool: + """Return True when ~/.arc/{settings,submit,inputs}.py overlays should be skipped. + + The repo's arc/settings/settings.py is always loaded as the baseline. This + guard only controls whether a user's personal ~/.arc/*.py files are layered + on top — we want them ignored under pytest (so tests see the same defaults + CI does) and overridable explicitly via env var. + + Triggered by either pytest being loaded (`'pytest' in sys.modules`, true + from collection onward) or an explicit ARC_IGNORE_LOCAL_SETTINGS=1 env var. + """ + if os.environ.get('ARC_IGNORE_LOCAL_SETTINGS') == '1': + return True + return 'pytest' in sys.modules + + # Common imports where the user can optionally put a modified copy of settings.py or submit.py file under ~/.arc home = os.getenv("HOME") or os.path.expanduser("~") local_arc_path = os.path.join(home, '.arc') +_skip_local = _local_overlays_disabled() local_arc_settings_path = os.path.join(local_arc_path, 'settings.py') settings = {key: val for key, val in vars(arc_settings).items() if '__' not in key} -if os.path.isfile(local_arc_settings_path): +if not _skip_local and os.path.isfile(local_arc_settings_path): local_settings = dict() if local_arc_path not in sys.path: sys.path.insert(1, local_arc_path) @@ -32,7 +49,7 @@ if 'global_ess_settings' in local_settings_dict and local_settings_dict['global_ess_settings'] else None local_arc_submit_path = os.path.join(local_arc_path, 'submit.py') -if os.path.isfile(local_arc_submit_path): +if not _skip_local and os.path.isfile(local_arc_submit_path): local_incore_commands, local_pipe_submit, local_submit_scripts = dict(), dict(), dict() if local_arc_path not in sys.path: sys.path.insert(1, local_arc_path) @@ -56,7 +73,7 @@ submit_scripts.update(local_submit_scripts) local_arc_inputs_path = os.path.join(local_arc_path, 'inputs.py') -if os.path.isfile(local_arc_inputs_path): +if not _skip_local and os.path.isfile(local_arc_inputs_path): local_input_files = dict() if local_arc_path not in sys.path: sys.path.insert(1, local_arc_path) diff --git a/arc/job/adapter.py b/arc/job/adapter.py index 9f47ea0c6a..d4b146a9e8 100644 --- a/arc/job/adapter.py +++ b/arc/job/adapter.py @@ -109,6 +109,7 @@ class JobEnum(str, Enum): autotst = 'autotst' # AutoTST, 10.1021/acs.jpca.7b07361, 10.26434/chemrxiv.13277870.v2 gcn = 'gcn' # Graph neural network for isomerization, https://doi.org/10.1021/acs.jpclett.0c00500 heuristics = 'heuristics' # ARC's heuristics + crest = 'crest' # CREST conformer/TS search kinbot = 'kinbot' # KinBot, 10.1016/j.cpc.2019.106947 linear = 'linear' # ARC's linear TS search user = 'user' # user guesses @@ -220,9 +221,82 @@ def execute(self): with an HDF5 file that contains specific directions. The output is returned within the HDF5 file. The new ARC instance, representing a single worker, will run all of its jobs incore. + + Connection sharing: for remote-queue jobs we lease one + :class:`SSHClient` from the process-global pool + (:mod:`arc.job.ssh_pool`) and reuse it for both file upload and + qsub/sbatch submission within this call. Across an entire ARC + run, every remote job for a given server reuses the *same* + pooled client — 100 TS guess opts share one paramiko Transport + instead of opening 200. Pipe mode currently can't bundle these + (``should_use_pipe`` refuses non-``local`` servers, see + ``arc/job/pipe/pipe_coordinator.py:77``); the pool is the + leverage available short of full remote-pipe support. """ - self.upload_files() execution_type = JobExecutionTypeEnum(self.execution_type) + use_shared_ssh = ( + execution_type == JobExecutionTypeEnum.queue + and self.server is not None + and self.server != 'local' + and not self.testing + ) + if use_shared_ssh: + from arc.job.ssh_pool import get_default_pool + with get_default_pool().borrow(self.server) as ssh: + self._shared_ssh = ssh + try: + self._dispatch_execution(execution_type) + finally: + # Pool retains the SSHClient; clearing the attr + # just prevents a later code path on this adapter + # from grabbing a stale reference if the pool + # subsequently reaps and reopens the connection. + self._shared_ssh = None + else: + self._dispatch_execution(execution_type) + if not self.restarted: + self._write_initiated_job_to_csv_file() + + def _open_or_borrow_ssh(self): + """Yield an :class:`SSHClient` for ``self.server``, in priority order: + + 1. ``self._shared_ssh`` if set — the per-call client opened by + :meth:`execute`. Available within the upload+submit window. + 2. The process-global pool (:mod:`arc.job.ssh_pool`) — keeps + one client alive across jobs for the run's lifetime, so the + hot status-poll loop reuses connections. + 3. A fresh ``SSHClient`` opened just for this call — only hit + when the pool can't construct one (testing, exotic env). + + Returns a context manager that does NOT close the underlying + client on exit; the pool retains ownership in case (2), and + case (3) opens-and-closes inline. + """ + from contextlib import contextmanager + shared = getattr(self, '_shared_ssh', None) + if shared is not None: + @contextmanager + def _shared_cm(): + yield shared + return _shared_cm() + try: + from arc.job.ssh_pool import get_default_pool + return get_default_pool().borrow(self.server) + except Exception: + # Pool refused (e.g., factory failed). Fall back to a + # one-shot client so we degrade gracefully — the caller + # gets correctness at the cost of one connection. + logger.debug("ssh pool unavailable; opening one-shot client", exc_info=True) + @contextmanager + def _fresh_cm(): + with SSHClient(self.server) as fresh: + yield fresh + return _fresh_cm() + + def _dispatch_execution(self, execution_type: 'JobExecutionTypeEnum') -> None: + """Inner body of :meth:`execute`, factored out so the SSH-share + wrapper around it stays small and readable.""" + self.upload_files() if execution_type == JobExecutionTypeEnum.incore: self.initial_time = datetime.datetime.now() self.job_status[0] = 'running' @@ -237,19 +311,25 @@ def execute(self): raise ValueError('Pipe execution is handled at the Scheduler level. ' 'JobAdapters inside a pipe must be executed by the worker ' "with execution_type='incore'.") - if not self.restarted: - self._write_initiated_job_to_csv_file() - def legacy_queue_execution(self): + def legacy_queue_execution(self, ssh: 'SSHClient | None' = None): """ Execute a job to the server's queue. The server could be either "local" or remote. + + ``ssh`` is an explicitly-passed shared connection. When ``None`` + we route through :meth:`_open_or_borrow_ssh` which prefers + ``self._shared_ssh`` (set by :meth:`execute`), then the + process-global pool, then opens fresh. """ self._log_job_execution() # Submit to queue, differentiate between local (same machine using its queue) and remote servers. if self.server != 'local': - with SSHClient(self.server) as ssh: + if ssh is not None: self.job_status[0], self.job_id = ssh.submit_job(remote_path=self.remote_path) + else: + with self._open_or_borrow_ssh() as borrowed: + self.job_status[0], self.job_id = borrowed.submit_job(remote_path=self.remote_path) else: # submit to the local queue self.job_status[0], self.job_id = submit_job(path=self.local_path) @@ -365,26 +445,24 @@ def set_file_paths(self): self.set_additional_file_paths() - def upload_files(self): + def upload_files(self, ssh: 'SSHClient | None' = None): """ Upload the relevant files for the job. + + ``ssh`` is an explicitly-passed shared connection. When ``None`` + we route through :meth:`_open_or_borrow_ssh` which prefers + ``self._shared_ssh`` (set by :meth:`execute`), then the + process-global pool, then opens fresh. """ if not self.testing: if self.execution_type != 'incore' and self.server != 'local': # If the job execution type is incore, then no need to upload any files. # Also, even if the job is submitted to the que, no need to upload files if the server is local. - with SSHClient(self.server) as ssh: - for up_file in self.files_to_upload: - logger.debug(f"Uploading {up_file['file_name']} source {up_file['source']} to {self.server}") - if up_file['source'] == 'path': - ssh.upload_file(remote_file_path=up_file['remote'], local_file_path=up_file['local']) - elif up_file['source'] == 'input_files': - ssh.upload_file(remote_file_path=up_file['remote'], file_string=up_file['local']) - else: - raise ValueError(f"Unclear file source for {up_file['file_name']}. Should either be 'path' or " - f"'input_files', got: {up_file['source']}") - if up_file['make_x']: - ssh.change_mode(mode='+x', file_name=up_file['file_name'], remote_path=self.remote_path) + if ssh is not None: + self._upload_with_ssh(ssh) + else: + with self._open_or_borrow_ssh() as borrowed: + self._upload_with_ssh(borrowed) else: # running locally, just copy the check file, if exists, to the job folder for up_file in self.files_to_upload: @@ -395,6 +473,25 @@ def upload_files(self): pass self.initial_time = datetime.datetime.now() + def _upload_with_ssh(self, ssh) -> None: + """SFTP-put every entry in ``self.files_to_upload`` over an open client. + + Factored out of :meth:`upload_files` so the with-shared vs. + with-new code paths share one body — adding a future per-file + knob (compression, retry, throttle) lands in one place. + """ + for up_file in self.files_to_upload: + logger.debug(f"Uploading {up_file['file_name']} source {up_file['source']} to {self.server}") + if up_file['source'] == 'path': + ssh.upload_file(remote_file_path=up_file['remote'], local_file_path=up_file['local']) + elif up_file['source'] == 'input_files': + ssh.upload_file(remote_file_path=up_file['remote'], file_string=up_file['local']) + else: + raise ValueError(f"Unclear file source for {up_file['file_name']}. Should either be 'path' or " + f"'input_files', got: {up_file['source']}") + if up_file['make_x']: + ssh.change_mode(mode='+x', file_name=up_file['file_name'], remote_path=self.remote_path) + def download_files(self): """ Download the relevant files. @@ -403,7 +500,7 @@ def download_files(self): if self.execution_type != 'incore' and self.server != 'local': # If the job execution type is incore, then no need to download any files. # Also, even if the job is submitted to the que, no need to download files if the server is local. - with SSHClient(self.server) as ssh: + with self._open_or_borrow_ssh() as ssh: for dl_file in self.files_to_download: ssh.download_file(remote_file_path=dl_file['remote'], local_file_path=dl_file['local']) self.set_initial_and_final_times(ssh=ssh) @@ -411,6 +508,16 @@ def download_files(self): self.set_initial_and_final_times() self.final_time = self.final_time or datetime.datetime.now() + def remove_remote_files(self): + """ + Remove the job's remote work directory after a successful run, to keep cluster quota in check. + No-op for local servers or when no remote_path is set. + """ + if self.server is None or self.server == 'local' or not self.remote_path: + return + with self._open_or_borrow_ssh() as ssh: + ssh.remove_dir(remote_path=self.remote_path) + def set_initial_and_final_times(self, ssh: SSHClient | None = None): """ Set the end time of the job. @@ -703,7 +810,7 @@ def delete(self): logger.debug(f'Deleting job {self.job_name} for {self.species_label}') if self.server != 'local': logger.debug(f'deleting job on {self.server}...') - with SSHClient(self.server) as ssh: + with self._open_or_borrow_ssh() as ssh: ssh.delete_job(self.job_id) else: logger.debug('deleting job locally...') @@ -766,20 +873,20 @@ def _get_additional_job_info(self): content = '' cluster_soft = servers[self.server]['cluster_soft'].lower() if cluster_soft in ['oge', 'sge', 'slurm', 'pbs', 'htcondor']: + # job.log is HTCondor's native event log; other clusters don't produce one. + include_job_log = cluster_soft == 'htcondor' local_file_path_1 = os.path.join(self.local_path, 'out.txt') local_file_path_2 = os.path.join(self.local_path, 'err.txt') - local_file_path_3 = os.path.join(self.local_path, 'job.log') + local_file_path_3 = os.path.join(self.local_path, 'job.log') if include_job_log else None if self.server != 'local' and self.remote_path is not None and not self.testing: - remote_file_path_1 = os.path.join(self.remote_path, 'out.txt') - remote_file_path_2 = os.path.join(self.remote_path, 'err.txt') - remote_file_path_3 = os.path.join(self.remote_path, 'job.log') - with SSHClient(self.server) as ssh: - for local_file_path, remote_file_path in zip([local_file_path_1, - local_file_path_2, - local_file_path_3], - [remote_file_path_1, - remote_file_path_2, - remote_file_path_3]): + remote_paths = [os.path.join(self.remote_path, 'out.txt'), + os.path.join(self.remote_path, 'err.txt')] + local_paths = [local_file_path_1, local_file_path_2] + if include_job_log: + remote_paths.append(os.path.join(self.remote_path, 'job.log')) + local_paths.append(local_file_path_3) + with self._open_or_borrow_ssh() as ssh: + for local_file_path, remote_file_path in zip(local_paths, remote_paths): try: ssh.download_file(remote_file_path=remote_file_path, local_file_path=local_file_path) @@ -789,7 +896,7 @@ def _get_additional_job_info(self): f'flags with stdout and stderr of out.txt and err.txt, respectively ' f'(e.g., "#SBATCH -o out.txt"). Error message:') logger.warning(e) - for local_file_path in [local_file_path_1, local_file_path_2, local_file_path_3]: + for local_file_path in filter(None, [local_file_path_1, local_file_path_2, local_file_path_3]): if os.path.isfile(local_file_path): with open(local_file_path, 'r') as f: lines = f.readlines() @@ -797,15 +904,14 @@ def _get_additional_job_info(self): content += '\n' else: raise ValueError(f'Unrecognized cluster software: {cluster_soft}') - if content: - self.additional_job_info = content.lower() + self.additional_job_info = content.lower() if content else None def _check_job_server_status(self) -> str: """ Possible statuses: ``initializing``, ``running``, ``errored on node xx``, ``done``. """ if self.server != 'local' and not self.testing: - with SSHClient(self.server) as ssh: + with self._open_or_borrow_ssh() as ssh: return ssh.check_job_status(self.job_id) else: return check_job_status(self.job_id) @@ -818,6 +924,10 @@ def _check_job_ess_status(self): Raises: IOError: If the output file and any additional server information cannot be found. """ + existing_keywords = list(self.job_status[1].get('keywords', list())) + # Refresh scheduler-side logs before ESS parsing so server-reported OOMs + # can be detected even when the output file is absent or incomplete. + self._get_additional_job_info() if self.server != 'local' and self.execution_type != 'incore': if os.path.exists(self.local_path_to_output_file): os.remove(self.local_path_to_output_file) @@ -856,7 +966,22 @@ def _check_job_ess_status(self): ) else: status, keywords, error, line = '', '', '', '' + if self.additional_job_info: + try: + status, keywords, error, line = determine_ess_status( + output_path=self.local_path_to_output_file, + species_label=self.species_label, + job_type=self.job_type, + job_log=self.additional_job_info, + software=self.job_adapter, + ) + except FileNotFoundError: + status, keywords, error, line = '', '', '', '' self.job_status[1]['status'] = status + if 'max_total_job_memory' in existing_keywords and status == 'errored' \ + and isinstance(keywords, list) and 'Memory' in keywords \ + and 'max_total_job_memory' not in keywords: + keywords.append('max_total_job_memory') self.job_status[1]['keywords'] = keywords self.job_status[1]['error'] = error self.job_status[1]['line'] = line.rstrip() diff --git a/arc/job/adapter_test.py b/arc/job/adapter_test.py index 3e3536ea09..9560237733 100644 --- a/arc/job/adapter_test.py +++ b/arc/job/adapter_test.py @@ -19,6 +19,12 @@ from arc.imports import settings from arc.job.adapter import JobAdapter, JobEnum, JobTypeEnum, JobExecutionTypeEnum from arc.job.adapters.gaussian import GaussianAdapter +from arc.job.ssh_pool import ( + SSHConnectionPool, + get_default_pool, + reset_default_pool, + set_default_pool, +) from arc.level import Level from arc.species import ARCSpecies @@ -182,6 +188,12 @@ def setUpClass(cls): server='server3', testing=True, ) + os.makedirs(cls.job_5.local_path, exist_ok=True) + fixture_path = os.path.join(ARC_TESTING_PATH, 'trsh', 'wall_exceeded.txt') + with open(fixture_path, 'r') as f: + log_content = f.read() + with open(os.path.join(cls.job_5.local_path, 'out.txt'), 'w') as f: + f.write(log_content) cls.job_6 = GaussianAdapter(execution_type='queue', job_name='opt_101', job_type='opt', @@ -250,6 +262,24 @@ def test_set_cpu_and_mem(self): self.assertEqual(self.job_4.submit_script_memory, expected_memory) self.job_4.server = 'local' + def test_set_cpu_and_mem_marks_max_total_job_memory(self): + """Test tagging jobs whose requested memory is clipped to the node cap.""" + job = GaussianAdapter(execution_type='queue', + job_type='opt', + level=Level(method='cbs-qb3'), + project='test', + project_directory=os.path.join(ARC_TESTING_PATH, 'test_JobAdapter'), + species=[ARCSpecies(label='spc1', xyz=['O 0 0 1'])], + server='server2', + job_memory_gb=300, + testing=True, + ) + + job.set_cpu_and_mem() + + self.assertAlmostEqual(job.job_memory_gb, 256 * 0.95) + self.assertIn('max_total_job_memory', job.job_status[1]['keywords']) + def test_set_file_paths(self): """Test setting up the job's paths""" self.assertEqual(self.job_1.local_path, os.path.join(self.job_1.project_directory, 'calcs', 'Species', @@ -323,6 +353,42 @@ def test_determine_job_status(self): self.assertEqual(self.job_5.job_status[1]['status'], 'errored') self.assertEqual(self.job_5.job_status[1]['keywords'], ['ServerTimeLimit']) + @patch('arc.job.adapter.determine_ess_status') + def test_preserve_max_total_job_memory_keyword(self, mock_determine_ess_status): + """Test preserving the max_total_job_memory marker across ESS status parsing.""" + self.job_4.job_status[1]['keywords'] = ['max_total_job_memory'] + self.job_4.initial_time = datetime.datetime.now() - datetime.timedelta(minutes=2) + self.job_4.final_time = datetime.datetime.now() - datetime.timedelta(minutes=1) + os.makedirs(self.job_4.local_path, exist_ok=True) + with open(self.job_4.local_path_to_output_file, 'w') as f: + f.write('dummy output') + mock_determine_ess_status.return_value = ( + 'errored', + ['MDCI', 'Memory'], + 'Insufficient job memory.', + 'Please increase MaxCore', + ) + + self.job_4._check_job_ess_status() + + self.assertEqual(self.job_4.job_status[1]['status'], 'errored') + self.assertEqual(self.job_4.job_status[1]['keywords'], ['MDCI', 'Memory', 'max_total_job_memory']) + + def test_check_job_ess_status_without_output_uses_job_log_memory_error(self): + """Test detecting server-reported memory errors even when the output file is absent.""" + if os.path.isfile(self.job_4.local_path_to_output_file): + os.remove(self.job_4.local_path_to_output_file) + self.job_4.initial_time = datetime.datetime.now() - datetime.timedelta(minutes=2) + self.job_4.final_time = datetime.datetime.now() - datetime.timedelta(minutes=1) + self.job_4.additional_job_info = '\tMEMORY EXCEEDED\n' + + with patch.object(self.job_4, '_get_additional_job_info'): + self.job_4._check_job_ess_status() + + self.assertEqual(self.job_4.job_status[1]['status'], 'errored') + self.assertEqual(self.job_4.job_status[1]['keywords'], ['Memory']) + self.assertEqual(self.job_4.job_status[1]['error'], 'Insufficient job memory.') + @patch( "arc.job.trsh.servers", { @@ -400,5 +466,367 @@ def test_multiple_rotations(self): self.assertEqual(len(archives), 2) +# --------------------------------------------------------------------------- +# SSH connection sharing & pooling (Options 1 + 2). +# +# Option 1 (per-job share): one SSHClient covers both upload and submit +# inside a single execute() call — collapses 2N connections to N. +# Option 2 (process-lifetime pool): the SSHClient for a given server is +# kept alive across jobs — collapses N to a small constant. +# --------------------------------------------------------------------------- + + +class _SSHClientStub: + """In-memory SSHClient lookalike for the pool to hand out. + + Records every upload/submit so tests can assert which calls landed + on which (shared) client. The pool calls ``connect()`` after + instantiation; we no-op that since there's no real socket. + """ + + def __init__(self, server): + self.server = server + self.uploaded = [] + self.submits = [] + self.downloaded = [] + self._closed = False + # Mimic SSHClient's ``_ssh`` attribute so ssh_pool._is_alive() + # finds an active fake-Transport. + self._ssh = _FakeParamikoSSH() + + def connect(self): + pass # the real one opens TCP+auth; we no-op for tests + + def close(self): + self._closed = True + self._ssh = None + + def __enter__(self): + return self + + def __exit__(self, *a): + return False + + def upload_file(self, *, remote_file_path, local_file_path=None, file_string=None): + self.uploaded.append(remote_file_path) + + def submit_job(self, remote_path, recursion=False): + self.submits.append(remote_path) + return 'initializing', 12345 + + def change_mode(self, *, mode, file_name, remote_path): + pass + + # Methods that the post-submit lifecycle paths exercise. + def check_job_status(self, job_id): + return 'running' + + def download_file(self, *, remote_file_path, local_file_path): + self.downloaded.append(remote_file_path) + + def remove_dir(self, *, remote_path): + pass + + def delete_job(self, job_id): + pass + + +class _FakeParamikoSSH: + """Stand-in for paramiko.SSHClient — _is_alive checks Transport.is_active().""" + def get_transport(self): + return _FakeTransport() + + +class _FakeTransport: + def is_active(self): + return True + + +class _StubFactoryPool: + """A pool whose factory builds _SSHClientStub instead of real SSHClient. + + Wraps the production ``SSHConnectionPool`` so reuse + lifecycle + semantics are exactly the production behavior — only the + underlying object is faked. + """ + + def __init__(self): + self.created = [] # log of every server name we built a client for + def factory(server): + client = _SSHClientStub(server) + self.created.append(server) + return client + self._inner = SSHConnectionPool(factory=factory) + + def borrow(self, server): + return self._inner.borrow(server) + + def close_all(self): + self._inner.close_all() + + @property + def opens(self): + return self._inner.opens + + @property + def borrows(self): + return self._inner.borrows + + +class _MinimalAdapter(JobAdapter): + """Concrete JobAdapter with just enough state to exercise execute(). + + Skips the heavyweight construction the GaussianAdapter does — we + only need ``server``, ``execution_type``, ``files_to_upload``, + ``remote_path``, and ``testing=False`` for the SSH-share path. + """ + + job_adapter = 'mockter' + + def __init__(self, *, server, execution_type='queue'): + # Bypass JobAdapter.__init__ entirely — all of its real work + # (file paths, settings, csv setup) is unrelated to the SSH + # share contract we're testing here. + self.server = server + self.execution_type = execution_type + self.testing = False + self.restarted = True # skip _write_initiated_job_to_csv_file + self.files_to_upload = [ + {'file_name': 'input.gjf', 'source': 'path', + 'local': '/local/input.gjf', 'remote': '/remote/input.gjf', 'make_x': False}, + {'file_name': 'submit.sh', 'source': 'path', + 'local': '/local/submit.sh', 'remote': '/remote/submit.sh', 'make_x': True}, + ] + self.remote_path = '/remote' + self.local_path = '/local' + self.job_status = ['initializing', {'status': 'initializing'}] + self.job_id = 0 + self.initial_time = None + self.final_time = None + self.job_name = 'job_test' + self.species_label = 'spc_test' + + # JobAdapter requires these abstracts; trivial bodies are fine. + def execute_incore(self): pass + def execute_queue(self): self.legacy_queue_execution() + def write_input_file(self): pass + def set_files(self): pass + def set_additional_file_paths(self): pass + def set_input_file_memory(self): pass + def upload_during_execution(self): pass + def _log_job_execution(self): pass + + +class TestSSHConnectionSharing(unittest.TestCase): + """``execute()`` shares one SSHClient per remote-queue job, and the + pool reuses it across jobs.""" + + def setUp(self): + # Inject a pool whose factory builds stubs, so the test never + # tries to open a real SSH connection to a server that isn't + # in this user's settings (e.g., 'server2'). + self._stub_pool = _StubFactoryPool() + set_default_pool(self._stub_pool) + # Also stub the legacy-direct path: bare + # ``legacy_queue_execution()`` (called outside execute()) uses + # the SSHClient class in ``arc.job.adapter`` directly, so patch + # that name with a context-manager wrapper around our stub. + self._direct_patch = patch( + 'arc.job.adapter.SSHClient', + _SSHClientStub, + ) + self._direct_patch.start() + + def tearDown(self): + set_default_pool(None) + self._direct_patch.stop() + + def test_remote_queue_opens_one_ssh_per_job(self): + """Upload + submit share a single SSHClient inside one execute().""" + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.execute() + # One SSHClient created (the pool's first borrow), one borrow. + self.assertEqual(self._stub_pool.opens, 1) + self.assertEqual(self._stub_pool.borrows, 1) + + def test_remote_queue_clears_shared_ssh_after_dispatch(self): + """``self._shared_ssh`` is None after execute() returns.""" + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.execute() + self.assertIsNone(getattr(adapter, '_shared_ssh', None)) + + def test_local_server_opens_no_ssh(self): + """local-server queue jobs use the host's queue, no SSH at all.""" + adapter = _MinimalAdapter(server='local', execution_type='queue') + with patch('arc.job.adapter.submit_job', return_value=('initializing', 99)): + adapter.execute() + self.assertEqual(self._stub_pool.opens, 0) + self.assertEqual(self._stub_pool.borrows, 0) + + def test_incore_opens_no_ssh(self): + """incore execution runs in-process — never touches SSH.""" + adapter = _MinimalAdapter(server='server2', execution_type='incore') + adapter.execute() + self.assertEqual(self._stub_pool.opens, 0) + + def test_legacy_queue_execution_routes_through_pool_when_called_directly(self): + """Even when called bare (outside execute()), legacy_queue_execution + now reuses the pool — that's Option 2's payoff for adapter + ``execute_queue`` overrides that call ``self.legacy_queue_execution()`` + from inside their own custom flow. + """ + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.legacy_queue_execution() # bare — no execute() wrapper + self.assertEqual(self._stub_pool.opens, 1) + self.assertEqual(self._stub_pool.borrows, 1) + + def test_shared_ssh_carries_uploads_and_submit(self): + """The pooled SSHClient sees both upload calls AND the submit call.""" + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.execute() + # Inspect the stub the pool kept. + self.assertEqual(self._stub_pool.opens, 1) + client = self._stub_pool._inner._clients['server2'] + self.assertEqual(len(client.uploaded), 2) + self.assertEqual(len(client.submits), 1) + + +class TestSSHConnectionPoolReuse(unittest.TestCase): + """The process-lifetime pool reuses one SSHClient across many jobs.""" + + def setUp(self): + self._stub_pool = _StubFactoryPool() + set_default_pool(self._stub_pool) + + def tearDown(self): + set_default_pool(None) + + def test_one_open_for_many_jobs_same_server(self): + """100 jobs against one server → 1 SSHClient, 100 borrows.""" + for _ in range(100): + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.execute() + self.assertEqual(self._stub_pool.opens, 1, "should reuse the same client") + self.assertEqual(self._stub_pool.borrows, 100) + + def test_separate_clients_per_distinct_server(self): + """Different servers → different clients, each opened once.""" + for _ in range(5): + _MinimalAdapter(server='server2', execution_type='queue').execute() + for _ in range(3): + _MinimalAdapter(server='server3', execution_type='queue').execute() + self.assertEqual(self._stub_pool.opens, 2) + self.assertEqual(self._stub_pool.borrows, 8) + self.assertEqual(sorted(self._stub_pool._inner._clients.keys()), + ['server2', 'server3']) + + def test_dead_client_is_reaped_and_reopened(self): + """If the underlying Transport reports inactive, pool reopens.""" + # First borrow → opens stub #1. + _MinimalAdapter(server='server2', execution_type='queue').execute() + client1 = self._stub_pool._inner._clients['server2'] + # Simulate a dead Transport (remote rebooted, etc.). + client1._ssh = None + # Next borrow should detect the dead client and open a fresh one. + _MinimalAdapter(server='server2', execution_type='queue').execute() + client2 = self._stub_pool._inner._clients['server2'] + self.assertIs(client1._closed, True, "stale client should be closed before reopen") + self.assertIsNot(client1, client2) + self.assertEqual(self._stub_pool.opens, 2) + + def test_close_all_closes_every_pooled_client(self): + for srv in ('server2', 'server3'): + _MinimalAdapter(server=srv, execution_type='queue').execute() + clients = list(self._stub_pool._inner._clients.values()) + self._stub_pool.close_all() + self.assertEqual(self._stub_pool._inner._clients, {}) + for c in clients: + self.assertTrue(c._closed) + + def test_close_all_is_idempotent(self): + _MinimalAdapter(server='server2', execution_type='queue').execute() + self._stub_pool.close_all() + # Second call must not raise or mutate state. + self._stub_pool.close_all() + self.assertEqual(self._stub_pool._inner._clients, {}) + + def test_status_poll_reuses_pooled_client(self): + """The hot path: hundreds of status checks open exactly one client. + + ARC polls a job's queue status every poll cycle for the entire + duration of the job. Pre-pool, each call opened a fresh + SSHClient. After Option 2, all polls reuse the pool's client + for that server — the dominant SSH-cost reducer in a real run. + """ + adapter = _MinimalAdapter(server='server2', execution_type='queue') + # Simulate 200 poll cycles (~1.5 hour run at 30s polling). + for _ in range(200): + adapter._check_job_server_status() + self.assertEqual(self._stub_pool.opens, 1, "pool should reuse one client") + self.assertEqual(self._stub_pool.borrows, 200) + + def test_download_files_reuses_pooled_client(self): + """download_files (called once per finished job) uses the pool too.""" + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.files_to_download = [ + {'remote': '/r/output.log', 'local': '/l/output.log'}, + ] + # set_initial_and_final_times reads file mtimes — stub it. + adapter.set_initial_and_final_times = lambda ssh=None: None + adapter.download_files() + client = self._stub_pool._inner._clients['server2'] + self.assertIn('/r/output.log', client.downloaded) + self.assertEqual(self._stub_pool.opens, 1) + + def test_full_lifecycle_one_open_per_server(self): + """Submit + many polls + download + cleanup all share one pooled client. + + End-to-end view of one job's life: this collapses what was + previously ~(2 + N_polls + 1 + 1) ≈ N+4 individual SSH + connections into a single reused client. + """ + adapter = _MinimalAdapter(server='server2', execution_type='queue') + adapter.files_to_download = [{'remote': '/r/o.log', 'local': '/l/o.log'}] + adapter.set_initial_and_final_times = lambda ssh=None: None + + adapter.execute() # upload + submit (1 borrow) + for _ in range(50): # 50 status polls + adapter._check_job_server_status() + adapter.download_files() # 1 download borrow + adapter.remove_remote_files() # 1 cleanup borrow + adapter.delete() # 1 delete borrow + + # All phases share the same pooled client. + self.assertEqual(self._stub_pool.opens, 1) + # 1 execute + 50 polls + 1 download + 1 cleanup + 1 delete = 54 borrows. + self.assertEqual(self._stub_pool.borrows, 54) + + +class TestSSHPoolDefaultLifecycle(unittest.TestCase): + """The module-level default pool is lazy and resettable.""" + + def setUp(self): + reset_default_pool() + + def tearDown(self): + reset_default_pool() + + def test_get_default_pool_is_idempotent(self): + p1 = get_default_pool() + p2 = get_default_pool() + self.assertIs(p1, p2) + + def test_reset_default_pool_drops_the_instance(self): + p1 = get_default_pool() + reset_default_pool() + p2 = get_default_pool() + self.assertIsNot(p1, p2) + + def test_set_default_pool_replaces_instance(self): + replacement = _StubFactoryPool() + set_default_pool(replacement) + self.assertIs(get_default_pool(), replacement) + + if __name__ == '__main__': unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) diff --git a/arc/job/adapters/common.py b/arc/job/adapters/common.py index 06c8ccf605..852d14afc5 100644 --- a/arc/job/adapters/common.py +++ b/arc/job/adapters/common.py @@ -9,7 +9,6 @@ import sys import re -from pprint import pformat from typing import TYPE_CHECKING from arc.common import get_logger @@ -501,12 +500,7 @@ def set_job_args(args: dict | None, Returns: dict: The initialized job specific arguments. """ - # Ignore user-specified additional job arguments when troubleshooting. - if args is not None and args and any(val for val in args.values()) \ - and level is not None and level.args and any(val for val in level.args.values()): - logger.warning(f'When troubleshooting {job_name}, ARC ignores the following user-specified options:\n' - f'{pformat(level.args)}') - elif not args and level is not None: + if not args and level is not None: args = level.args for key in ['keyword', 'block', 'trsh']: if key not in args.keys(): diff --git a/arc/job/adapters/gaussian.py b/arc/job/adapters/gaussian.py index 314e796641..1d8076a8b8 100644 --- a/arc/job/adapters/gaussian.py +++ b/arc/job/adapters/gaussian.py @@ -218,6 +218,32 @@ def __init__(self, elif self.species[0].checkfile is not None and os.path.isfile(self.species[0].checkfile): self.checkfile = self.species[0].checkfile + def _user_requested_verytight(self) -> bool: + """ + Return True if the user passed ``verytight`` through ``self.args``. + + When True, the fine-mode opt-keyword builder will skip auto-adding + ``tight`` so the user's ``verytight`` wins unambiguously in the final + ``opt=(...)`` clause assembled by ``combine_parameters``. + + Scope: only the ``keyword`` channel is meaningful here — ``block`` is for + Gaussian input blocks and ``trsh`` is for ARC's troubleshooting layer, + neither of which is the right place for a user opt-cutoff override. + + Notes for the implementer: + - ``self.args['keyword']`` is a ``dict[str, str]`` (see + ``Level._check_args`` at arc/level.py:281). Values may be in any + case and may contain other keywords too (e.g. ``"opt=(verytight) + freq=hpmodes"``). + - Match must be word-bounded: a stray ``verytightscf`` should not + count, and ``tight`` alone must NOT match. + - Be defensive: ``self.args`` or ``self.args['keyword']`` may be + missing or empty depending on how the Level was constructed. + """ + keyword_args = (self.args or {}).get('keyword') or {} + joined = ' '.join(str(v) for v in keyword_args.values()) + return re.search(r'\bverytight\b', joined, re.IGNORECASE) is not None + def write_input_file(self) -> None: """ Write the input file to execute the job on the server. @@ -306,10 +332,15 @@ def write_input_file(self) -> None: if input_dict['trsh']: input_dict['trsh'] += ' ' input_dict['trsh'] += 'scf=(tight,direct)' + # 'no_tight' is set by trsh_keyword_loose_disp when a previous attempt hit + # MaxOptCycles with forces converged but displacement criteria unreachable. + # 'tight' is also dropped when the user requested verytight, which replaces it. + drop_tight = 'no_tight' in self.ess_trsh_methods or self._user_requested_verytight() + tight_kw = [] if drop_tight else ['tight'] if self.is_ts: - keywords.extend(['tight', 'maxstep=5']) + keywords.extend([*tight_kw, 'maxstep=5']) else: - keywords.extend(['tight', 'maxstep=5', f'maxcycle={max_c}']) + keywords.extend([*tight_kw, 'maxstep=5', f'maxcycle={max_c}']) input_dict['job_type_1'] = "opt" if self.level.method_type not in ['dft', 'composite', 'wavefunction']\ else f"opt=({', '.join(key for key in keywords)})" diff --git a/arc/job/adapters/gaussian_test.py b/arc/job/adapters/gaussian_test.py index 0a84d19372..6d2672eeca 100644 --- a/arc/job/adapters/gaussian_test.py +++ b/arc/job/adapters/gaussian_test.py @@ -1164,6 +1164,28 @@ def test_trsh_write_input_file(self): self.assertEqual(content_24, job_24_expected_input_file) + def test_user_requested_verytight(self): + """Detection only fires for word-bounded ``verytight`` in the keyword channel.""" + cases = [ + ({'keyword': {'opt': 'opt=(verytight)'}, 'block': {}, 'trsh': {}}, True), + ({'keyword': {'opt': 'opt=(VeryTight)'}, 'block': {}, 'trsh': {}}, True), + ({'keyword': {'general': 'opt=(calcfc)'}, 'block': {}, 'trsh': {}}, False), + ({'keyword': {'opt': 'opt=(tight)'}, 'block': {}, 'trsh': {}}, False), + ({'keyword': {'general': 'verytightscf'}, 'block': {}, 'trsh': {}}, False), + ({'keyword': {}, 'block': {}, 'trsh': {}}, False), + # verytight smuggled in via block/trsh must not count + ({'keyword': {}, 'block': {'1': 'opt=(verytight)'}, 'trsh': {}}, False), + ({'keyword': {}, 'block': {}, 'trsh': {'opt': 'opt=(verytight)'}}, False), + ] + original_args = self.job_3.args + try: + for args, expected in cases: + with self.subTest(args=args): + self.job_3.args = args + self.assertEqual(self.job_3._user_requested_verytight(), expected) + finally: + self.job_3.args = original_args + @classmethod def tearDownClass(cls): diff --git a/arc/job/adapters/orca_test.py b/arc/job/adapters/orca_test.py index 436d7b77cb..9a536eca9d 100644 --- a/arc/job/adapters/orca_test.py +++ b/arc/job/adapters/orca_test.py @@ -99,6 +99,34 @@ def test_set_input_file_memory(self): expected_memory = math.ceil(14 * 1024 / 8) self.assertEqual(self.job_1.input_file_memory, expected_memory) + def test_set_input_file_memory_with_configured_core_count(self): + """Test ORCA %%maxcore calculation for a configured total memory and cpu count.""" + original_memory = self.job_1.job_memory_gb + original_cpu_cores = self.job_1.cpu_cores + self.job_1.job_memory_gb = 250 + self.job_1.cpu_cores = 22 + self.job_1.set_input_file_memory() + self.assertEqual(self.job_1.input_file_memory, math.ceil(250 * 1024 / 22)) + self.job_1.job_memory_gb = original_memory + self.job_1.cpu_cores = original_cpu_cores + self.job_1.set_input_file_memory() + + def test_write_input_file_with_configured_core_count(self): + """Test rendering ORCA input for a configured total memory and cpu count.""" + original_memory = self.job_1.job_memory_gb + original_cpu_cores = self.job_1.cpu_cores + self.job_1.job_memory_gb = 250 + self.job_1.cpu_cores = 22 + self.job_1.set_input_file_memory() + self.job_1.write_input_file() + with open(os.path.join(self.job_1.local_path, input_filenames[self.job_1.job_adapter]), 'r') as f: + content = f.read() + self.assertIn('%maxcore 11637', content) + self.assertIn('%pal nprocs 22 end', content) + self.job_1.job_memory_gb = original_memory + self.job_1.cpu_cores = original_cpu_cores + self.job_1.set_input_file_memory() + def test_write_input_file(self): """Test writing Orca input files""" self.job_1.write_input_file() diff --git a/arc/job/adapters/scripts/gcn_script.py b/arc/job/adapters/scripts/gcn_script.py index dd52643e3c..bb71802375 100644 --- a/arc/job/adapters/scripts/gcn_script.py +++ b/arc/job/adapters/scripts/gcn_script.py @@ -2,26 +2,83 @@ # encoding: utf-8 """ -A standalone script to retrieve xyz data from GCN, +A standalone script to retrieve xyz TS guesses from GCN (the TS-GCN project), should be run under the ts_gcn environment. + +This script must NOT import any ``arc`` modules: it is executed with the +``ts_gcn`` environment's Python interpreter (see ``arc.job.env_run``), +where ARC and its dependencies are not installed. + +Two modes are supported: + +1. Direct (used by the GCN adapter's incore execution, one direction per call):: + + python gcn_script.py --r_sdf_path reactant.sdf --p_sdf_path product.sdf --ts_xyz_path TS.xyz + +2. Batch (used by queue execution via a YAML input file):: + + python gcn_script.py --yml_in_path input.yml + + where the YAML file contains the keys ``reactant_path``, ``product_path``, + ``local_path``, ``yml_out_path``, and ``repetitions``. """ import argparse import datetime import os +import sys +import traceback import yaml -from rdkit import Chem -from inference import inference + +def import_inference(): + """ + Import and return the ``inference`` function of the TS-GCN repository. + + The TS-GCN clone must be importable as the top-level ``inference`` module. + ARC's installers (devtools/install_gcn.sh, devtools/install_gcn_cpu.sh) + clone TS-GCN next to the ARC clone and export its path via ``~/.bashrc`` + or a conda activation hook, but launchers such as ``conda run`` source + neither, so fall back to well-known locations here. + + Returns: + callable: The ``inference.inference`` function. + + Raises: + ImportError: If the TS-GCN repository could not be located. + """ + candidate_dirs = list() + tsgcn_root = os.environ.get('TSGCN_ROOT') + if tsgcn_root: + candidate_dirs.append(tsgcn_root) + # This script lives at /arc/job/adapters/scripts/gcn_script.py, + # and the installers clone TS-GCN as a sibling of the ARC clone. + arc_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) + candidate_dirs.append(os.path.join(os.path.dirname(arc_root), 'TS-GCN')) + try: + from inference import inference + except ImportError: + for candidate in candidate_dirs: + if os.path.isfile(os.path.join(candidate, 'inference.py')) and candidate not in sys.path: + sys.path.insert(0, candidate) + try: + from inference import inference + except ImportError as e: + raise ImportError(f'Could not import the TS-GCN "inference" module. ' + f'Searched sys.path and the candidate TS-GCN locations {candidate_dirs}. ' + f'Make sure TS-GCN is cloned and importable in the ts_gcn environment ' + f'(see devtools/install_gcn.sh). Got: {e}') from e + return inference def initialize_gcn_run(input_dict: dict): """ - Initialize a GCN run. + Initialize a batch GCN run. Args: - input_dict (dict): The input dictionary. + input_dict (dict): The input dictionary with keys ``reactant_path``, ``product_path``, + ``local_path``, ``yml_out_path``, and ``repetitions``. """ ts_fwd_path = os.path.join(input_dict['local_path'], "TS_fwd.xyz") ts_rev_path = os.path.join(input_dict['local_path'], "TS_rev.xyz") @@ -47,10 +104,10 @@ def run_gcn_locally(direction: str, ts_path: str, ) -> dict: """ - Run GCN incore. + Run GCN in a single direction and package the result as a TS guess dictionary. Args: - direction (str): Either 'F' or 'R' for forward ort reverse directions, respectively. + direction (str): Either 'F' or 'R' for forward or reverse directions, respectively. reactant_path (str): The path to the reactant SDF file. product_path (str): The path to the product SDF file. ts_path (str): The path to the resulting TS guess file. @@ -63,11 +120,13 @@ def run_gcn_locally(direction: str, 'initial_xyz': None, } t0 = datetime.datetime.now() - run_gcn(r_sdf_path=reactant_path, - p_sdf_path=product_path, - ts_xyz_path=ts_path, - ) if os.path.isfile(ts_path): + os.remove(ts_path) + success = run_gcn(r_sdf_path=reactant_path, + p_sdf_path=product_path, + ts_xyz_path=ts_path, + ) + if success and os.path.isfile(ts_path): with open(ts_path, 'r') as f: tsg['initial_xyz'] = ''.join(f.readlines()[2:]) tsg['success'] = True @@ -80,19 +139,34 @@ def run_gcn_locally(direction: str, def run_gcn(r_sdf_path: str, p_sdf_path: str, ts_xyz_path: str, - ): + ) -> bool: + """ + Run a single GCN inference: read the reactant and product SDF files + and write the TS guess to ``ts_xyz_path``. + + Args: + r_sdf_path (str): The path to the reactant SDF file. + p_sdf_path (str): The path to the product SDF file. + ts_xyz_path (str): The path to write the TS guess to. + + Returns: + bool: Whether the inference completed without raising. + """ + from rdkit import Chem + inference = import_inference() r_mols = Chem.SDMolSupplier(r_sdf_path, removeHs=False, sanitize=True) p_mols = Chem.SDMolSupplier(p_sdf_path, removeHs=False, sanitize=True) try: inference(r_mols, p_mols, ts_xyz_path) - except: - print('\nGCN Failed\n') + except Exception as e: + print(f'GCN inference failed with: {e}\n{traceback.format_exc()}', file=sys.stderr) + return False + return True def read_yaml_file(path: str): """ - Read a YAML file (usually an input / restart file, but also conformers file) - and return the parameters as python variables. + Read a YAML file and return its content. Args: path (str): The YAML file path to read. @@ -136,9 +210,15 @@ def parse_command_line_arguments(command_line_args=None): This uses the :mod:`argparse` module, which ensures that the command-line arguments are sensible, parses them, and returns them. """ - parser = argparse.ArgumentParser(description='GCN') - parser.add_argument('--yml_in_path', metavar='input', type=str, default='input.yml', - help='A path to the YAML input file') + parser = argparse.ArgumentParser(description='GCN TS guess generation (TS-GCN)') + parser.add_argument('--yml_in_path', metavar='input', type=str, default=None, + help='A path to a YAML input file (batch mode)') + parser.add_argument('--r_sdf_path', metavar='reactant', type=str, default=None, + help='A path to the reactant SDF file (direct mode)') + parser.add_argument('--p_sdf_path', metavar='product', type=str, default=None, + help='A path to the product SDF file (direct mode)') + parser.add_argument('--ts_xyz_path', metavar='ts', type=str, default=None, + help='A path to write the TS guess XYZ file to (direct mode)') args = parser.parse_args(command_line_args) return args @@ -148,8 +228,19 @@ def main(): Run GCN to generate TS guesses. """ args = parse_command_line_arguments() - yml_in_path = str(args.yml_in_path) - initialize_gcn_run(input_dict=read_yaml_file(yml_in_path)) + if args.yml_in_path is not None: + initialize_gcn_run(input_dict=read_yaml_file(str(args.yml_in_path))) + elif args.r_sdf_path is not None and args.p_sdf_path is not None and args.ts_xyz_path is not None: + success = run_gcn(r_sdf_path=str(args.r_sdf_path), + p_sdf_path=str(args.p_sdf_path), + ts_xyz_path=str(args.ts_xyz_path), + ) + if not success or not os.path.isfile(str(args.ts_xyz_path)): + sys.exit(1) + else: + print('Either --yml_in_path, or all of --r_sdf_path, --p_sdf_path, and --ts_xyz_path must be given.', + file=sys.stderr) + sys.exit(2) if __name__ == '__main__': diff --git a/arc/job/adapters/scripts/kinbot_script.py b/arc/job/adapters/scripts/kinbot_script.py new file mode 100644 index 0000000000..32211c8675 --- /dev/null +++ b/arc/job/adapters/scripts/kinbot_script.py @@ -0,0 +1,424 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +A standalone script to generate TS guess geometries with KinBot, +should be run under the kinbot_env environment. + +This script must only import KinBot, PyYAML, and the standard library +(it runs in an environment where ARC is not installed). + +Input (YAML file passed via ``--yml_in_path``), keys: + charge (int): The well/reaction charge. + multiplicity (int): The well/reaction multiplicity. + families (list[str]): The KinBot reaction families to search. + wells (list[dict]): Entries are dicts with keys: + direction (str): 'F' or 'R', the direction this well corresponds to. + smiles (str): A SMILES representation of the well (resonance structure specific). + structure (list): The cartesian coordinates in the KinBot flat list format, + [symbol_1, x_1, y_1, z_1, symbol_2, ...]. + uma (dict, optional): UMA (FAIRChem) refinement options with keys: + refine (bool): Whether to refine successful TS guesses with a Sella + saddle-point search on the UMA potential (default: False). + model_path (str): A path to a local UMA checkpoint file (e.g., uma-s-1p2.pt). + model_name (str): A pretrained UMA model name to fetch from HuggingFace + (license-gated) if no model_path is given (default: uma-s-1p1). + task_name (str): The UMA task name (default: omol). + device (str): The torch device to use (default: cpu). + fmax (float): The Sella force convergence criterion (default: 0.005). + steps (int): The maximal number of Sella steps (default: 250). + yml_out_path (str): The path to which the results are saved. + +Output (YAML file written to ``yml_out_path``): a list of TS guess dicts with keys: + direction (str): 'F' or 'R' (copied from the corresponding well). + success (bool): Whether KinBot successfully generated a template-modified geometry. + coords (list[list[float]] or None): The TS guess cartesian coordinates (N x 3). + execution_time (str): The time it took to generate this guess. + uma_refined (bool): Whether the coordinates were refined on the UMA potential. + uma_error (str, optional): Why UMA refinement fell back to the unrefined guess. + error (str, optional): An error message if the well processing failed. +""" + +import argparse +import datetime +import json +import os +import sys +import tempfile +import traceback + +import yaml + +from kinbot.modify_geom import modify_coordinates +from kinbot.parameters import Parameters +from kinbot.qc import QuantumChemistry +from kinbot.reaction_finder import ReactionFinder +from kinbot.reaction_generator import ReactionGenerator +from kinbot.stationary_pt import StationaryPoint + + +def set_up_kinbot(smiles: str, + structure: list, + families: list, + multiplicity: int, + charge: int, + ) -> 'ReactionGenerator': + """ + Set up KinBot to run for a unimolecular reaction starting from the single well side. + + Args: + smiles (str): The SMILES representation of the unimolecular well to react. + structure (list): The cartesian coordinates of the well in the KinBot list format. + families (list): The specific KinBot families to try. + multiplicity (int): The well/reaction multiplicity. + charge (int): The well/reaction charge. + + Returns: + ReactionGenerator: The KinBot ReactionGenerator instance. + """ + par_dict = {'title': 'ARC', + # molecule information + 'smiles': smiles, + 'structure': structure, + 'charge': charge, + 'mult': multiplicity, + # steps + 'reaction_search': 1, + 'families': families, + 'pes': 0, + 'high_level': 0, + 'conformer_search': 0, + 'me': 0, + 'ringrange': [3, 9], + # Not used in ARC's flow (no KinBot QC jobs are ever spawned), but + # Parameters refuses to initialize without a barrier threshold (kcal/mol). + 'barrier_threshold': 200.0, + } + # Parameters validates its values at construction time (and calls sys.exit() on + # invalid ones), so the overrides must be supplied via a JSON input file rather + # than by mutating params.par after the fact. + fd, par_path = tempfile.mkstemp(suffix='.json', prefix='kinbot_params_', dir=os.getcwd(), text=True) + try: + with os.fdopen(fd, 'w') as f: + json.dump(par_dict, f) + params = Parameters(par_path) + finally: + os.remove(par_path) + + well = StationaryPoint(name='well0', + charge=charge, + mult=multiplicity, + structure=structure, + ) + + well.calc_chemid() + well.bond_mx() + well.find_cycle() + well.find_atom_eqv() + well.find_conf_dihedral() + + qc = QuantumChemistry(params.par) + rxn_finder = ReactionFinder(well, params.par, qc) + rxn_finder.find_reactions() + + reaction_generator = ReactionGenerator(species=well, + par=params.par, + qc=qc, + input_file=None, + ) + + return reaction_generator + + +def apply_kinbot_template(kinbot_rxn) -> tuple: + """ + Drive a KinBot reaction instance's constraint template over its full step sequence, + applying the geometry ``change`` constraints of every step to the evolving geometry. + + This mirrors what KinBot's ``reac_family.carry_out_reaction()`` does between the + constrained-optimization jobs it submits (including the short-instance ``skip`` + shortcut, the possible step bump returned by ``get_constraints``, and the ``'L'`` + change-entry prefix), just without running any QC relaxation in between. + Steps ``0 .. max_step - 1`` are the template modification steps; at + ``step == max_step`` KinBot runs a saddle-point search that does not modify the + template geometry, so the loop stops there. + + Args: + kinbot_rxn: The KinBot reaction instance (a GeneralReac subclass instance). + + Returns: + tuple: (success (bool), geom (N x 3 array-like)). + """ + geom = kinbot_rxn.species.geom + success = True + step = 0 + if kinbot_rxn.skip and len(kinbot_rxn.instance) < 4: + step = 12 + while step < kinbot_rxn.max_step: + step, fix, change, release = kinbot_rxn.get_constraints(step, geom) + + change_starting_zero = list() + for c in change: + if c[0] == 'L': + c_new = ['L'] + [ci - 1 for ci in c[1:-1]] + else: + c_new = [ci - 1 for ci in c[:-1]] + c_new.append(c[-1]) + change_starting_zero.append(c_new) + + if change_starting_zero and 'frozen' not in kinbot_rxn.instance_name: + step_success, geom = modify_coordinates(species=kinbot_rxn.species, + name=kinbot_rxn.instance_name, + geom=geom, + changes=change_starting_zero, + bond=kinbot_rxn.species.bond, + ) + success = success and bool(step_success) + step += 1 + return success, geom + + +def get_ts_guesses_for_well(well: dict, + families: list, + multiplicity: int, + charge: int, + ) -> list: + """ + Generate TS guesses for a single well. + + Args: + well (dict): The well dictionary with 'direction', 'smiles', and 'structure' keys. + families (list): The specific KinBot families to try. + multiplicity (int): The well/reaction multiplicity. + charge (int): The well/reaction charge. + + Returns: + list: TS guess dictionaries. + """ + results = list() + reaction_generator = set_up_kinbot(smiles=well['smiles'], + structure=well['structure'], + families=families, + multiplicity=multiplicity, + charge=charge, + ) + for kinbot_rxn in reaction_generator.species.reac_obj: + t0 = datetime.datetime.now() + success, coords = apply_kinbot_template(kinbot_rxn) + results.append({'direction': well['direction'], + 'success': bool(success), + 'coords': [[float(ci) for ci in coord] for coord in coords] if success else None, + 'execution_time': str(datetime.datetime.now() - t0), + 'uma_refined': False, + }) + return results + + +_UMA_PREDICT_UNIT = None + + +def _get_uma_calculator(uma: dict): + """ + Lazily import fairchem and build a FAIRChemCalculator on the UMA potential. + The (expensive to load) predict unit is cached at the module level, so multiple + guesses within one worker run reuse it. + + Args: + uma (dict): The UMA options (see the module docstring). + + Returns: + FAIRChemCalculator: The ASE calculator. + """ + global _UMA_PREDICT_UNIT + from fairchem.core import FAIRChemCalculator + if _UMA_PREDICT_UNIT is None: + device = uma.get('device') or 'cpu' + model_path = uma.get('model_path') or '' + if model_path: + from fairchem.core.units.mlip_unit import load_predict_unit + _UMA_PREDICT_UNIT = load_predict_unit(model_path, device=device) + else: + # Fetch a pretrained checkpoint from HuggingFace. UMA checkpoints are + # license-gated (require a HuggingFace login + accepting Meta's license), + # any failure here is caught by the caller and refinement is skipped. + from fairchem.core import pretrained_mlip + _UMA_PREDICT_UNIT = pretrained_mlip.get_predict_unit(uma.get('model_name') or 'uma-s-1p1', + device=device) + return FAIRChemCalculator(_UMA_PREDICT_UNIT, task_name=uma.get('task_name') or 'omol') + + +def refine_ts_guess_with_uma(symbols: list, + coords: list, + charge: int, + multiplicity: int, + uma: dict, + label: str, + ) -> tuple: + """ + Refine a TS guess with a Sella saddle-point search (order=1) on the UMA + (FAIRChem) machine-learned potential, mirroring KinBot 2.3.0's + ``tpl/ase_fc_ts_end.tpl.py`` template (without KinBot's frequency check — + ARC validates TS guesses downstream with its own frequency and IRC checks). + + fairchem is imported lazily in here only: the worker must keep functioning in a + kinbot_env without the fairchem extra installed. + + Args: + symbols (list): The chemical element symbols. + coords (list): The initial (template-modified) coordinates, N x 3. + charge (int): The species charge. + multiplicity (int): The species multiplicity. + uma (dict): The UMA options (see the module docstring). + label (str): A label used for the Sella log file name. + + Returns: + tuple: (refined coords (N x 3 list) or None, error message (str) or None). + """ + try: + from ase import Atoms + from sella import Sella + calc = _get_uma_calculator(uma) + except (ImportError, ModuleNotFoundError) as e: + return None, (f'UMA refinement was requested, but fairchem is not available in kinbot_env ' + f'({e}). Install it with "devtools/install_kinbot.sh --uma".') + except Exception as e: + traceback.print_exc(file=sys.stderr) + return None, f'Could not load the UMA model: {e.__class__.__name__}: {e}' + try: + mol = Atoms(symbols=symbols, positions=coords) + mol.info.update({'charge': charge, 'spin': multiplicity}) + mol.calc = calc + opt = Sella(mol, + order=1, + internal=len(symbols) >= 5, # mirrors KinBot's fc TS templates + logfile=f'{label}_uma_sella.log', + ) + converged = opt.run(fmax=float(uma.get('fmax') or 0.005), + steps=int(uma.get('steps') or 250)) + if not converged: + return None, 'The Sella saddle-point search on the UMA potential did not converge.' + return [[float(ci) for ci in coord] for coord in mol.positions], None + except Exception as e: + traceback.print_exc(file=sys.stderr) + return None, f'UMA refinement failed: {e.__class__.__name__}: {e}' + + +def run_kinbot(input_dict: dict) -> None: + """ + Run KinBot for all wells in the input dictionary and save the results. + + Args: + input_dict (dict): The input dictionary. + """ + results = list() + uma = input_dict.get('uma') or dict() + for well in input_dict['wells']: + try: + well_results = get_ts_guesses_for_well(well=well, + families=input_dict['families'], + multiplicity=input_dict['multiplicity'], + charge=input_dict['charge'], + ) + except Exception as e: + traceback.print_exc(file=sys.stderr) + results.append({'direction': well.get('direction'), + 'success': False, + 'coords': None, + 'execution_time': None, + 'uma_refined': False, + 'error': f'{e.__class__.__name__}: {e}', + }) + continue + if uma.get('refine'): + symbols = well['structure'][0::4] + for i, entry in enumerate(well_results): + if not entry['success']: + continue + refined_coords, uma_error = refine_ts_guess_with_uma( + symbols=symbols, + coords=entry['coords'], + charge=input_dict['charge'], + multiplicity=input_dict['multiplicity'], + uma=uma, + label=f"{well['direction']}_{i}", + ) + if refined_coords is not None: + entry['coords'] = refined_coords + entry['uma_refined'] = True + else: + # Fall back to the unrefined template guess, never fail the task. + entry['uma_error'] = uma_error + print(f"UMA refinement fell back to the unrefined guess for " + f"{well['direction']} {i}: {uma_error}", file=sys.stderr) + results.extend(well_results) + save_yaml_file(path=input_dict['yml_out_path'], content=results) + + +def read_yaml_file(path: str): + """ + Read a YAML file and return its content. + + Args: + path (str): The YAML file path to read. + + Returns: Union[dict, list] + The content read from the file. + """ + with open(path, 'r') as f: + content = yaml.load(stream=f, Loader=yaml.FullLoader) + return content + + +def save_yaml_file(path: str, + content: list, + ) -> None: + """ + Save a YAML file. + + Args: + path (str): The YAML file path to save. + content (list): The content to save. + """ + yaml.add_representer(str, string_representer) + yaml_str = yaml.dump(data=content) + with open(path, 'w') as f: + f.write(yaml_str) + + +def string_representer(dumper, data): + """ + Add a custom string representer to use block literals for multiline strings. + """ + if len(data.splitlines()) > 1: + return dumper.represent_scalar(tag='tag:yaml.org,2002:str', value=data, style='|') + return dumper.represent_scalar(tag='tag:yaml.org,2002:str', value=data) + + +def parse_command_line_arguments(command_line_args=None): + """ + Parse command-line arguments. + This uses the :mod:`argparse` module, which ensures that the command-line arguments are + sensible, parses them, and returns them. + """ + parser = argparse.ArgumentParser(description='KinBot') + parser.add_argument('--yml_in_path', metavar='input', type=str, default='input.yml', + help='A path to the YAML input file') + args = parser.parse_args(command_line_args) + return args + + +def main(): + """ + Run KinBot to generate TS guesses. + """ + args = parse_command_line_arguments() + yml_in_path = str(args.yml_in_path) + input_dict = read_yaml_file(yml_in_path) + # KinBot's QuantumChemistry writes a 'kinbot.db' file (and possibly other + # scratch files) to the current working directory, direct these to the job folder. + os.chdir(os.path.dirname(os.path.abspath(input_dict['yml_out_path']))) + run_kinbot(input_dict=input_dict) + + +if __name__ == '__main__': + main() diff --git a/arc/job/adapters/scripts/xtb_gsm/ograd b/arc/job/adapters/scripts/xtb_gsm/ograd index d208a7502f..f5c01f4f3a 100644 --- a/arc/job/adapters/scripts/xtb_gsm/ograd +++ b/arc/job/adapters/scripts/xtb_gsm/ograd @@ -24,3 +24,21 @@ tm2orca.py $basename rm xtbrestart cd .. +# ── Per-node provenance preservation (TCKDB path_search_result.points) ── +# tm2orca.py renames the xTB-generated Turbomole-format ``energy`` +# and ``gradient`` files (xTB writes its --grad output in Turbomole's +# on-disk text format; the calculation provenance is xTB, not Turbomole) +# to ``.energy`` and ``.gradient`` +# inside scratch/. The GSM binary then consumes the ORCA-shaped +# ``.engrad`` and may overwrite or remove the per-node files on +# subsequent calls. Copy them (plus the captured xtb stdout) into a +# stable side-effect directory at the run root so the TCKDB adapter's +# parser can recover per-node electronic energies and gradient metrics +# later. The copies are not consumed by GSM — the original scratch/ +# files stay in place unchanged for the algorithm. +node_label="$1" +preserve_dir="gsm_node_outputs" +mkdir -p "$preserve_dir" +[ -f "scratch/${basename}.energy" ] && cp -p "scratch/${basename}.energy" "$preserve_dir/${node_label}.energy" +[ -f "scratch/${basename}.gradient" ] && cp -p "scratch/${basename}.gradient" "$preserve_dir/${node_label}.gradient" +[ -f "scratch/${ofile}.xtbout" ] && cp -p "scratch/${ofile}.xtbout" "$preserve_dir/${node_label}.xtbout" diff --git a/arc/job/adapters/torch_ani_test.py b/arc/job/adapters/torch_ani_test.py index c34e47a4ef..b45f49da71 100644 --- a/arc/job/adapters/torch_ani_test.py +++ b/arc/job/adapters/torch_ani_test.py @@ -12,11 +12,13 @@ from arc.common import almost_equal_coords, almost_equal_lists, read_yaml_file from arc.job.adapters.torch_ani import TorchANIAdapter -from arc.settings.settings import tani_default_options_dict +from arc.settings.settings import TANI_PYTHON, tani_default_options_dict from arc.species import ARCSpecies from arc.species.vectors import calculate_distance, calculate_angle, calculate_dihedral_angle +@unittest.skipUnless(TANI_PYTHON is not None, + "tani_env conda environment not found; TorchANI adapter tests require it.") class TestTorchANIAdapter(unittest.TestCase): """ Contains unit tests for the TorchANIAdapter class. diff --git a/arc/job/adapters/ts/__init__.py b/arc/job/adapters/ts/__init__.py index be9397bd0d..ccaf52d3d9 100644 --- a/arc/job/adapters/ts/__init__.py +++ b/arc/job/adapters/ts/__init__.py @@ -1,7 +1,9 @@ import arc.job.adapters.ts.autotst_ts +import arc.job.adapters.ts.crest import arc.job.adapters.ts.gcn_ts import arc.job.adapters.ts.heuristics import arc.job.adapters.ts.kinbot_ts import arc.job.adapters.ts.linear import arc.job.adapters.ts.orca_neb +import arc.job.adapters.ts.seed_hub import arc.job.adapters.ts.xtb_gsm diff --git a/arc/job/adapters/ts/crest.py b/arc/job/adapters/ts/crest.py new file mode 100644 index 0000000000..d882511872 --- /dev/null +++ b/arc/job/adapters/ts/crest.py @@ -0,0 +1,522 @@ +""" +Utilities for running CREST within ARC. + +Separated from heuristics so CREST can be conditionally imported and reused. +""" + +import datetime +import os +import time +from typing import TYPE_CHECKING, List, Optional, Union + +from arc.common import almost_equal_coords, get_logger +from arc.imports import settings, submit_scripts +from arc.job.adapter import JobAdapter +from arc.job.adapters.common import _initialize_adapter, ts_adapters_by_rmg_family +from arc.job.adapters.ts.heuristics import DIHEDRAL_INCREMENT +from arc.job.adapters.ts.seed_hub import get_ts_seeds, get_wrapper_constraints +from arc.job.factory import register_job_adapter +from arc.job.local import check_job_status, submit_job +from arc.plotter import save_geo +from arc.species.converter import reorder_xyz_string, str_to_xyz, xyz_to_str +from arc.species.species import ARCSpecies, TSGuess + +if TYPE_CHECKING: + from arc.level import Level + from arc.reaction import ARCReaction + +logger = get_logger() + +MAX_CHECK_INTERVAL_SECONDS = 100 + +CREST_PATH = settings.get("CREST_PATH", None) +CREST_ENV_PATH = settings.get("CREST_ENV_PATH", None) +SERVERS = settings.get("servers", {}) + + +def crest_available() -> bool: + """ + Return whether CREST is configured for use. + """ + return bool(SERVERS.get("local")) and bool(CREST_PATH or CREST_ENV_PATH) + + +class CrestAdapter(JobAdapter): + """ + A class for executing CREST TS conformer searches based on heuristics-generated guesses. + """ + + def __init__(self, + project: str, + project_directory: str, + job_type: Union[List[str], str], + args: Optional[dict] = None, + bath_gas: Optional[str] = None, + checkfile: Optional[str] = None, + conformer: Optional[int] = None, + constraints: Optional[List] = None, + cpu_cores: Optional[str] = None, + dihedral_increment: Optional[float] = None, + dihedrals: Optional[List[float]] = None, + directed_scan_type: Optional[str] = None, + ess_settings: Optional[dict] = None, + ess_trsh_methods: Optional[List[str]] = None, + execution_type: Optional[str] = None, + fine: bool = False, + initial_time: Optional[Union['datetime.datetime', str]] = None, + irc_direction: Optional[str] = None, + job_id: Optional[int] = None, + job_memory_gb: float = 14.0, + job_name: Optional[str] = None, + job_num: Optional[int] = None, + job_server_name: Optional[str] = None, + job_status: Optional[List[Union[dict, str]]] = None, + level: Optional['Level'] = None, + max_job_time: Optional[float] = None, + run_multi_species: bool = False, + reactions: Optional[List['ARCReaction']] = None, + rotor_index: Optional[int] = None, + server: Optional[str] = None, + server_nodes: Optional[list] = None, + queue: Optional[str] = None, + attempted_queues: Optional[List[str]] = None, + species: Optional[List[ARCSpecies]] = None, + testing: bool = False, + times_rerun: int = 0, + torsions: Optional[List[List[int]]] = None, + tsg: Optional[int] = None, + xyz: Optional[dict] = None, + ): + + self.incore_capacity = 50 + self.job_adapter = 'crest' + self.command = None + self.execution_type = execution_type or 'incore' + + if reactions is None: + raise ValueError('Cannot execute TS CREST without ARCReaction object(s).') + + dihedral_increment = dihedral_increment or DIHEDRAL_INCREMENT + + _initialize_adapter(obj=self, + is_ts=True, + project=project, + project_directory=project_directory, + job_type=job_type, + args=args, + bath_gas=bath_gas, + checkfile=checkfile, + conformer=conformer, + constraints=constraints, + cpu_cores=cpu_cores, + dihedral_increment=dihedral_increment, + dihedrals=dihedrals, + directed_scan_type=directed_scan_type, + ess_settings=ess_settings, + ess_trsh_methods=ess_trsh_methods, + fine=fine, + initial_time=initial_time, + irc_direction=irc_direction, + job_id=job_id, + job_memory_gb=job_memory_gb, + job_name=job_name, + job_num=job_num, + job_server_name=job_server_name, + job_status=job_status, + level=level, + max_job_time=max_job_time, + run_multi_species=run_multi_species, + reactions=reactions, + rotor_index=rotor_index, + server=server, + server_nodes=server_nodes, + queue=queue, + attempted_queues=attempted_queues, + species=species, + testing=testing, + times_rerun=times_rerun, + torsions=torsions, + tsg=tsg, + xyz=xyz, + ) + + def write_input_file(self) -> None: + pass + + def set_files(self) -> None: + pass + + def set_additional_file_paths(self) -> None: + pass + + def set_input_file_memory(self) -> None: + pass + + def execute_incore(self): + self._log_job_execution() + self.initial_time = self.initial_time if self.initial_time else datetime.datetime.now() + + supported_families = [key for key, val in ts_adapters_by_rmg_family.items() if 'crest' in val] + + self.reactions = [self.reactions] if not isinstance(self.reactions, list) else self.reactions + for rxn in self.reactions: + if rxn.family not in supported_families: + logger.warning(f'The CREST TS search adapter does not support the {rxn.family} reaction family.') + continue + if any(spc.get_xyz() is None for spc in rxn.r_species + rxn.p_species): + logger.warning(f'The CREST TS search adapter cannot process a reaction if 3D coordinates of ' + f'some/all of its reactants/products are missing.\nNot processing {rxn}.') + continue + if not crest_available(): + logger.warning('CREST is not available. Skipping CREST TS search.') + break + + if rxn.ts_species is None: + rxn.ts_species = ARCSpecies(label='TS', + is_ts=True, + charge=rxn.charge, + multiplicity=rxn.multiplicity, + ) + + tsg = TSGuess(method='CREST') + tsg.tic() + + crest_job_dirs = [] + xyz_guesses = get_ts_seeds( + reaction=rxn, + base_adapter='heuristics', + dihedral_increment=self.dihedral_increment, + ) + if not xyz_guesses: + logger.warning(f'CREST TS search failed to generate any seed guesses for {rxn.label}.') + tsg.tok() + continue + + for iteration, xyz_entry in enumerate(xyz_guesses): + xyz_guess = xyz_entry.get("xyz") + family = xyz_entry.get("family", rxn.family) + if xyz_guess is None: + continue + + crest_constraint_atoms = get_wrapper_constraints( + wrapper='crest', + reaction=rxn, + seed=xyz_entry, + ) + if not crest_constraint_atoms: + logger.warning( + f"Could not determine CREST constraint atoms for {rxn.label} crest seed {iteration} " + f"(family: {family}). Skipping this CREST seed." + ) + continue + + crest_job_dir = crest_ts_conformer_search( + xyz_guess, + crest_constraint_atoms["A"], + crest_constraint_atoms["H"], + crest_constraint_atoms["B"], + path=self.local_path, + xyz_crest_int=iteration, + ) + crest_job_dirs.append(crest_job_dir) + + if not crest_job_dirs: + logger.warning(f'CREST TS search failed to prepare any jobs for {rxn.label}.') + tsg.tok() + continue + + crest_jobs = submit_crest_jobs(crest_job_dirs) + monitor_crest_jobs(crest_jobs) + xyz_guesses_crest = process_completed_jobs(crest_jobs) + tsg.tok() + + for method_index, xyz in enumerate(xyz_guesses_crest): + if xyz is None: + continue + unique = True + for other_tsg in rxn.ts_species.ts_guesses: + if almost_equal_coords(xyz, other_tsg.initial_xyz): + if hasattr(other_tsg, "method_sources"): + other_tsg.method_sources = other_tsg._normalize_method_sources( + (other_tsg.method_sources or []) + ["crest"] + ) + unique = False + break + if unique: + # CREST is run without an explicit method flag, i.e., at its GFN2-xTB default. + ts_guess = TSGuess(method='CREST', + index=len(rxn.ts_species.ts_guesses), + method_index=method_index, + t0=tsg.t0, + execution_time=tsg.execution_time, + success=True, + family=rxn.family, + xyz=xyz, + level={'method': 'gfn2-xtb', 'software': 'crest'}, + ) + rxn.ts_species.ts_guesses.append(ts_guess) + save_geo(xyz=xyz, + path=self.local_path, + filename=f'CREST_{method_index}', + format_='xyz', + comment=f'CREST {method_index}, family: {rxn.family}', + ) + + if len(self.reactions) < 5: + successes = [tsg for tsg in rxn.ts_species.ts_guesses if tsg.success and 'crest' in tsg.method.lower()] + if successes: + logger.info(f'CREST successfully found {len(successes)} TS guesses for {rxn.label}.') + else: + logger.info(f'CREST did not find any successful TS guesses for {rxn.label}.') + + self.final_time = datetime.datetime.now() + + def execute_queue(self): + self.execute_incore() + + +def crest_ts_conformer_search( + xyz_guess: dict, + a_atom: int, + h_atom: int, + b_atom: int, + path: str = "", + xyz_crest_int: int = 0, +) -> str: + """ + Prepare a CREST TS conformer search job: + - Write coords.ref and constraints.inp + - Write a PBS/HTCondor submit script using submit_scripts["local"]["crest"] + - Return the CREST job directory path + """ + path = os.path.join(path, f"crest_{xyz_crest_int}") + os.makedirs(path, exist_ok=True) + + # --- coords.ref --- + symbols = xyz_guess["symbols"] + converted_coords = reorder_xyz_string( + xyz_str=xyz_to_str(xyz_guess), + reverse_atoms=True, + convert_to="bohr", + ) + coords_ref_content = f"$coord\n{converted_coords}\n$end\n" + coords_ref_path = os.path.join(path, "coords.ref") + with open(coords_ref_path, "w") as f: + f.write(coords_ref_content) + + # --- constraints.inp --- + num_atoms = len(symbols) + # CREST uses 1-based indices + a_atom += 1 + h_atom += 1 + b_atom += 1 + + # All atoms not directly involved in A–H–B go into the metadynamics atom list + list_of_atoms_numbers_not_participating_in_reaction = [ + i for i in range(1, num_atoms + 1) if i not in [a_atom, h_atom, b_atom] + ] + + constraints_path = os.path.join(path, "constraints.inp") + with open(constraints_path, "w") as f: + f.write("$constrain\n") + f.write(f" atoms: {a_atom}, {h_atom}, {b_atom}\n") + f.write(" force constant: 0.5\n") + f.write(" reference=coords.ref\n") + f.write(f" distance: {a_atom}, {h_atom}, auto\n") + f.write(f" distance: {h_atom}, {b_atom}, auto\n") + f.write("$metadyn\n") + if list_of_atoms_numbers_not_participating_in_reaction: + f.write( + f' atoms: {", ".join(map(str, list_of_atoms_numbers_not_participating_in_reaction))}\n' + ) + f.write("$end\n") + + # --- build CREST command string --- + # Example: crest coords.ref --cinp constraints.inp --noreftopo -T 8 + local_server = SERVERS.get("local", {}) + cpus = int(local_server.get("cpus", 8)) + if CREST_ENV_PATH: + crest_exe = "crest" + else: + crest_exe = CREST_PATH if CREST_PATH is not None else "crest" + + commands = [ + crest_exe, + "coords.ref", + "--cinp constraints.inp", + "--noreftopo", + f"-T {cpus}", + ] + command = " ".join(commands) + + # --- activation line (optional) --- + activation_line = CREST_ENV_PATH or "" + + if SERVERS.get("local") is not None: + cluster_soft = SERVERS["local"]["cluster_soft"].lower() + local_templates = submit_scripts.get("local", {}) + crest_template = local_templates.get("crest") + crest_job_template = local_templates.get("crest_job") + + if cluster_soft in ["condor", "htcondor"]: + # HTCondor branch with a built-in fallback template. + if crest_template is None: + crest_template = ( + "universe = vanilla\n" + "executable = job.sh\n" + "output = out.txt\n" + "error = err.txt\n" + "log = log.txt\n" + "request_cpus = {cpus}\n" + "request_memory = {memory}\n" + "JobBatchName = {name}\n" + "queue\n" + ) + if crest_job_template is None: + crest_job_template = ( + "#!/bin/bash -l\n" + "{activation_line}\n" + "cd {path}\n" + "{commands}\n" + ) + sub_job = crest_template + format_params = { + "name": f"crest_{xyz_crest_int}", + "cpus": cpus, + "memory": int(SERVERS["local"].get("memory", 32.0) * 1024), + } + sub_job = sub_job.format(**format_params) + + with open( + os.path.join(path, settings["submit_filenames"]["HTCondor"]), "w" + ) as f: + f.write(sub_job) + + crest_job = crest_job_template.format( + path=path, + activation_line=activation_line, + commands=command, + ) + + with open(os.path.join(path, "job.sh"), "w") as f: + f.write(crest_job) + os.chmod(os.path.join(path, "job.sh"), 0o700) + + # Pre-create out/err for any status checkers that expect them + for fname in ("out.txt", "err.txt"): + fpath = os.path.join(path, fname) + if not os.path.exists(fpath): + with open(fpath, "w") as f: + f.write("") + os.chmod(fpath, 0o600) + + elif cluster_soft == "pbs": + # PBS branch with a built-in fallback template. + if crest_template is None: + crest_template = ( + "#!/bin/bash -l\n" + "#PBS -q {queue}\n" + "#PBS -N {name}\n" + "#PBS -l select=1:ncpus={cpus}:mem={memory}gb\n" + "#PBS -o out.txt\n" + "#PBS -e err.txt\n\n" + "{activation_line}\n" + "cd {path}\n" + "{commands}\n" + ) + sub_job = crest_template + format_params = { + "queue": SERVERS["local"].get("queue", "alon_q"), + "name": f"crest_{xyz_crest_int}", + "cpus": cpus, + # 'memory' is in GB for the template: mem={memory}gb + "memory": int( + SERVERS["local"].get("memory", 32) + if SERVERS["local"].get("memory", 32) < 60 + else 40 + ), + "activation_line": activation_line, + "path": path, + "commands": command, + } + sub_job = sub_job.format(**format_params) + + submit_filename = settings["submit_filenames"]["PBS"] # usually 'submit.sh' + submit_path = os.path.join(path, submit_filename) + with open(submit_path, "w") as f: + f.write(sub_job) + os.chmod(submit_path, 0o700) + + else: + raise ValueError(f"Unsupported cluster_soft for CREST: {cluster_soft!r}") + + return path + + +def submit_crest_jobs(crest_paths: List[str]) -> dict: + """ + Submit CREST jobs to the server. + + Args: + crest_paths (List[str]): List of paths to the CREST directories. + + Returns: + dict: A dictionary containing job IDs as keys and their statuses as values. + """ + crest_jobs = {} + for crest_path in crest_paths: + job_status, job_id = submit_job(path=crest_path) + logger.info(f"CREST job {job_id} submitted for {crest_path}") + crest_jobs[job_id] = {"path": crest_path, "status": job_status} + return crest_jobs + + +def monitor_crest_jobs(crest_jobs: dict, check_interval: int = 300) -> None: + """ + Monitor CREST jobs until they are complete. + + Args: + crest_jobs (dict): Dictionary containing job information (job ID, path, and status). + check_interval (int): Time interval (in seconds) to wait between status checks. + """ + while True: + all_done = True + for job_id, job_info in crest_jobs.items(): + if job_info["status"] not in ["done", "failed"]: + try: + job_info["status"] = check_job_status(job_id) # Update job status + except Exception as e: + logger.error(f"Error checking job status for job {job_id}: {e}") + job_info["status"] = "failed" + if job_info["status"] not in ["done", "failed"]: + all_done = False + if all_done: + break + time.sleep(min(check_interval, MAX_CHECK_INTERVAL_SECONDS)) + + +def process_completed_jobs(crest_jobs: dict) -> list: + """ + Process the completed CREST jobs and update XYZ guesses. + + Args: + crest_jobs (dict): Dictionary containing job information. + """ + xyz_guesses = [] + for job_id, job_info in crest_jobs.items(): + crest_path = job_info["path"] + if job_info["status"] == "done": + crest_best_path = os.path.join(crest_path, "crest_best.xyz") + if os.path.exists(crest_best_path): + with open(crest_best_path, "r") as f: + content = f.read() + xyz_guess = str_to_xyz(content) + xyz_guesses.append(xyz_guess) + else: + logger.error(f"crest_best.xyz not found in {crest_path}") + elif job_info["status"] == "failed": + logger.error(f"CREST job failed for {crest_path}") + + return xyz_guesses + +register_job_adapter('crest', CrestAdapter) diff --git a/arc/job/adapters/ts/crest_test.py b/arc/job/adapters/ts/crest_test.py new file mode 100644 index 0000000000..e243d8d43d --- /dev/null +++ b/arc/job/adapters/ts/crest_test.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +""" +Unit tests for arc.job.adapters.ts.crest +""" + +import os +import tempfile +import unittest + +from arc.species.converter import str_to_xyz + + +class TestCrestAdapter(unittest.TestCase): + """ + Tests for CREST input generation. + """ + + def setUp(self): + self.tmpdir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.tmpdir.cleanup() + + def test_creates_valid_input_files(self): + """ + Ensure CREST inputs are written with expected content/format. + """ + from arc.job.adapters.ts import crest as crest_mod + + xyz = str_to_xyz( + """O 0.0 0.0 0.0 + H 0.0 0.0 0.96 + H 0.9 0.0 0.0""" + ) + + backups = { + "settings": crest_mod.settings, + "submit_scripts": crest_mod.submit_scripts, + "CREST_PATH": crest_mod.CREST_PATH, + "CREST_ENV_PATH": crest_mod.CREST_ENV_PATH, + "SERVERS": crest_mod.SERVERS, + } + + try: + crest_mod.settings = {"submit_filenames": {"PBS": "submit.sh"}} + crest_mod.submit_scripts = { + "local": { + "crest": ( + "#PBS -q {queue}\n" + "#PBS -N {name}\n" + "#PBS -l select=1:ncpus={cpus}:mem={memory}gb\n" + ), + "crest_job": "{activation_line}\ncd {path}\n{commands}\n", + } + } + crest_mod.CREST_PATH = "/usr/bin/crest" + crest_mod.CREST_ENV_PATH = "" + crest_mod.SERVERS = { + "local": {"cluster_soft": "pbs", "cpus": 4, "memory": 8, "queue": "testq"} + } + + crest_dir = crest_mod.crest_ts_conformer_search( + xyz_guess=xyz, a_atom=0, h_atom=1, b_atom=2, path=self.tmpdir.name, xyz_crest_int=0 + ) + + coords_path = os.path.join(crest_dir, "coords.ref") + constraints_path = os.path.join(crest_dir, "constraints.inp") + submit_path = os.path.join(crest_dir, "submit.sh") + + self.assertTrue(os.path.exists(coords_path)) + self.assertTrue(os.path.exists(constraints_path)) + self.assertTrue(os.path.exists(submit_path)) + + with open(coords_path) as f: + coords = f.read().strip().splitlines() + self.assertEqual(coords[0].strip(), "$coord") + self.assertEqual(coords[-1].strip(), "$end") + self.assertEqual(len(coords) - 2, len(xyz["symbols"])) + + with open(constraints_path) as f: + constraints = f.read() + self.assertIn("atoms: 1, 2, 3", constraints) + self.assertIn("force constant: 0.5", constraints) + self.assertIn("reference=coords.ref", constraints) + self.assertIn("distance: 1, 2, auto", constraints) + self.assertIn("distance: 2, 3, auto", constraints) + self.assertIn("$metadyn", constraints) + self.assertTrue(constraints.strip().endswith("$end")) + finally: + crest_mod.settings = backups["settings"] + crest_mod.submit_scripts = backups["submit_scripts"] + crest_mod.CREST_PATH = backups["CREST_PATH"] + crest_mod.CREST_ENV_PATH = backups["CREST_ENV_PATH"] + crest_mod.SERVERS = backups["SERVERS"] + + def test_creates_submit_file_without_crest_templates(self): + """ + Ensure fallback submit template generation works when submit.py has no CREST templates. + """ + from arc.job.adapters.ts import crest as crest_mod + + xyz = str_to_xyz( + """O 0.0 0.0 0.0 + H 0.0 0.0 0.96 + H 0.9 0.0 0.0""" + ) + + backups = { + "settings": crest_mod.settings, + "submit_scripts": crest_mod.submit_scripts, + "CREST_PATH": crest_mod.CREST_PATH, + "CREST_ENV_PATH": crest_mod.CREST_ENV_PATH, + "SERVERS": crest_mod.SERVERS, + } + + try: + crest_mod.settings = {"submit_filenames": {"PBS": "submit.sh"}} + crest_mod.submit_scripts = {"local": {}} + crest_mod.CREST_PATH = "/usr/bin/crest" + crest_mod.CREST_ENV_PATH = "" + crest_mod.SERVERS = { + "local": {"cluster_soft": "pbs", "cpus": 4, "memory": 8, "queue": "testq"} + } + + crest_dir = crest_mod.crest_ts_conformer_search( + xyz_guess=xyz, a_atom=0, h_atom=1, b_atom=2, path=self.tmpdir.name, xyz_crest_int=1 + ) + + submit_path = os.path.join(crest_dir, "submit.sh") + self.assertTrue(os.path.exists(submit_path)) + with open(submit_path) as f: + submit_text = f.read() + self.assertIn("#PBS -q testq", submit_text) + self.assertIn("coords.ref --cinp constraints.inp --noreftopo -T 4", submit_text) + finally: + crest_mod.settings = backups["settings"] + crest_mod.submit_scripts = backups["submit_scripts"] + crest_mod.CREST_PATH = backups["CREST_PATH"] + crest_mod.CREST_ENV_PATH = backups["CREST_ENV_PATH"] + crest_mod.SERVERS = backups["SERVERS"] + + +if __name__ == "__main__": + unittest.main() diff --git a/arc/job/adapters/ts/gcn_test.py b/arc/job/adapters/ts/gcn_test.py index 8524b00db2..5cb15b6fec 100644 --- a/arc/job/adapters/ts/gcn_test.py +++ b/arc/job/adapters/ts/gcn_test.py @@ -5,95 +5,219 @@ This module contains unit tests of the arc.job.adapters.ts.gcn module """ +import importlib.util import os import shutil +import subprocess import unittest +from unittest.mock import patch -from arc.common import ARC_TESTING_PATH +from arc.common import ARC_PATH, ARC_TESTING_PATH, read_yaml_file import arc.job.adapters.ts.gcn_ts as ts_gcn +from arc.job.adapters.ts.gcn_ts import GCNAdapter from arc.reaction import ARCReaction +from arc.settings.settings import TS_GCN_PYTHON from arc.species.converter import str_to_xyz from arc.species.species import ARCSpecies, TSGuess +GCN_SCRIPT_PATH = os.path.join(ARC_PATH, 'arc', 'job', 'adapters', 'scripts', 'gcn_script.py') + +TS_XYZ_F = """3 + +O 0.00000000 0.00000000 0.00000000 +H 0.00000000 0.00000000 0.97000000 +H 0.94000000 0.00000000 -0.24000000 +""" + +TS_XYZ_R = """3 + +O 0.00000000 0.00000000 0.10000000 +H 0.00000000 0.10000000 1.10000000 +H 1.05000000 0.00000000 -0.35000000 +""" + + +def load_gcn_script(): + """Load the standalone gcn_script.py (not a package module) for testing its plumbing.""" + spec = importlib.util.spec_from_file_location('gcn_script', GCN_SCRIPT_PATH) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def fake_run_in_conda_env_factory(returncode: int = 0, write_ts_file: bool = True): + """ + Return a fake run_in_conda_env() side effect that mimics gcn_script.py: + it writes the TS xyz file passed via --ts_xyz_path (direction-dependent content) + and returns a CompletedProcess with the requested return code. + """ + def fake_run_in_conda_env(python_executable, script_path, *script_args): + args = list(script_args) + ts_xyz_path = args[args.index('--ts_xyz_path') + 1] + if write_ts_file: + with open(ts_xyz_path, 'w') as f: + f.write(TS_XYZ_F if 'fwd' in os.path.basename(ts_xyz_path).lower() else TS_XYZ_R) + return subprocess.CompletedProcess(args=[python_executable, script_path] + args, + returncode=returncode, stdout='', stderr='') + return fake_run_in_conda_env + class TestGCNAdapter(unittest.TestCase): """ Contains unit tests for the GCNAdapter class. """ - @classmethod - def setUpClass(cls): + def setUp(self): """ - A method that is run before all unit tests in this class. + A method that is run before each unit test in this class. + Tests run in parallel (pytest-xdist), so each test gets its own directory. """ - cls.maxDiff = None - cls.output_dir = os.path.join(ARC_TESTING_PATH, 'GCN') - if not os.path.isdir(cls.output_dir): - os.makedirs(cls.output_dir) - cls.rxn_1 = ARCReaction(r_species=[ARCSpecies(label='nC3H7', smiles='[CH2]CC')], - p_species=[ARCSpecies(label='iC3H7', smiles='C[CH]C')]) - cls.reactant_path = os.path.join(cls.output_dir, 'react.sdf') - cls.product_path = os.path.join(cls.output_dir, 'prod.sdf') - cls.ts_path = os.path.join(cls.output_dir, 'ts.xyz') + self.maxDiff = None + self.output_dir = os.path.join(ARC_TESTING_PATH, f'GCN_{self._testMethodName}') + os.makedirs(self.output_dir, exist_ok=True) + self.addCleanup(shutil.rmtree, self.output_dir, ignore_errors=True) + self.reactant_path = os.path.join(self.output_dir, 'react.sdf') + self.product_path = os.path.join(self.output_dir, 'prod.sdf') + self.ts_fwd_path = os.path.join(self.output_dir, 'TS_fwd.xyz') + + @staticmethod + def get_reaction() -> ARCReaction: + """Get a fresh isomerization reaction instance.""" + return ARCReaction(r_species=[ARCSpecies(label='nC3H7', smiles='[CH2]CC')], + p_species=[ARCSpecies(label='iC3H7', smiles='C[CH]C')]) + + def get_adapter(self, rxn: ARCReaction) -> GCNAdapter: + """Get a GCNAdapter instance for testing.""" + project_dir = os.path.join(self.output_dir, 'project') + return GCNAdapter(job_type='tsg', + reactions=[rxn], + testing=True, + project='test_GCNAdapter', + project_directory=project_dir, + dihedral_increment=1, + ) + + def test_gcn_available(self): + """Test the gcn_available() function.""" + self.assertTrue(os.path.isfile(ts_gcn.GCN_SCRIPT_PATH)) + with patch.object(ts_gcn, 'TS_GCN_PYTHON', __file__): + # Any existing file stands in for the interpreter. + self.assertTrue(ts_gcn.gcn_available()) + with patch.object(ts_gcn, 'TS_GCN_PYTHON', None): + self.assertFalse(ts_gcn.gcn_available()) + with patch.object(ts_gcn, 'TS_GCN_PYTHON', '/path/does/not/exist/python'): + self.assertFalse(ts_gcn.gcn_available()) def test_write_sdf_files(self): """Test the write_sdf_files() function.""" - self.assertFalse(os.path.isfile(self.reactant_path)) - self.assertFalse(os.path.isfile(self.product_path)) - ts_gcn.write_sdf_files(rxn=self.rxn_1, + rxn = self.get_reaction() + ts_gcn.write_sdf_files(rxn=rxn, reactant_path=self.reactant_path, product_path=self.product_path, ) + self.assertTrue(os.path.isfile(self.reactant_path)) + self.assertTrue(os.path.isfile(self.product_path)) with open(self.reactant_path, 'r') as f: content_r_sdf = f.read() - with open(self.product_path, 'r') as f: - content_p_sdf = f.read() - expected_r_atoms = sorted(list(self.rxn_1.r_species[0].get_xyz()["symbols"]), key=ord) - expected_p_atoms = sorted(list(self.rxn_1.p_species[0].get_xyz()["symbols"]), key=ord) - r_xyz_atoms = sorted(list(filter(("").__ne__, content_r_sdf.split(" "))), key =len, reverse= False) - p_xyz_atoms = sorted(list(filter(("").__ne__, content_p_sdf.split(" "))), key =len, reverse= False) - i = 0 - r_atoms = [] - while len(r_xyz_atoms[i]) == 1: - if r_xyz_atoms[i] in ["C", "H", "O", "N"]: - r_atoms.append(r_xyz_atoms[i]) - i += 1 - i = 0 - p_atoms = [] - while len(p_xyz_atoms[i]) == 1: - if p_xyz_atoms[i] in ["C", "H", "O", "N"]: - p_atoms.append(p_xyz_atoms[i]) - i += 1 - self.assertEqual(r_atoms, expected_r_atoms) - self.assertEqual(p_atoms, expected_p_atoms) - - def test_run_subprocess_locally(self): - """Test the run_subprocess_locally() function""" - self.assertFalse(os.path.isfile(self.ts_path)) - self.rxn_1.ts_species = ARCSpecies(label='TS', is_ts=True) - ts_gcn.run_subprocess_locally(direction='F', - reactant_path=self.reactant_path, - product_path=self.product_path, - ts_path=self.ts_path, - local_path=self.output_dir, - ts_species=self.rxn_1.ts_species, - ) - if os.path.isfile(self.ts_path): - # The gcn_env automated creation isn't robust, only test this if an output file was generated. - xyz = str_to_xyz(self.ts_path) - self.assertEqual(xyz['symbols'], ('C', 'C', 'C', 'H', 'H', 'H', 'H', 'H', 'H', 'H')) + # 10 atoms (C3H7), the atom count appears in the SDF counts line. + self.assertIn(' 10 9', content_r_sdf) + + def test_run_subprocess_locally_success(self): + """Test run_subprocess_locally() passing the correct arguments and parsing the output.""" + ts_gcn.write_sdf_files(rxn=self.get_reaction(), + reactant_path=self.reactant_path, + product_path=self.product_path, + ) + ts_species = ARCSpecies(label='TS', is_ts=True) + with patch.object(ts_gcn, 'TS_GCN_PYTHON', '/fake/envs/ts_gcn/bin/python'), \ + patch.object(ts_gcn, 'run_in_conda_env', + side_effect=fake_run_in_conda_env_factory()) as mock_run: + ts_gcn.run_subprocess_locally(direction='F', + reactant_path=self.reactant_path, + product_path=self.product_path, + ts_path=self.ts_fwd_path, + local_path=self.output_dir, + ts_species=ts_species, + ) + self.assertEqual(mock_run.call_count, 1) + args = mock_run.call_args.args + self.assertEqual(args[0], '/fake/envs/ts_gcn/bin/python') + self.assertEqual(args[1], ts_gcn.GCN_SCRIPT_PATH) + # The reactant SDF must be passed to --r_sdf_path and the product SDF to --p_sdf_path. + flags = dict(zip(args[2::2], args[3::2])) + self.assertEqual(flags['--r_sdf_path'], self.reactant_path) + self.assertEqual(flags['--p_sdf_path'], self.product_path) + self.assertEqual(flags['--ts_xyz_path'], self.ts_fwd_path) + self.assertEqual(len(ts_species.ts_guesses), 1) + tsg = ts_species.ts_guesses[0] + self.assertTrue(tsg.success) + self.assertEqual(tsg.method, 'gcn') + self.assertEqual(tsg.method_direction, 'F') + self.assertEqual(tsg.initial_xyz['symbols'], ('O', 'H', 'H')) + + def test_run_subprocess_locally_failure(self): + """Test that a failing GCN subprocess is handled gracefully.""" + ts_species = ARCSpecies(label='TS', is_ts=True) + with patch.object(ts_gcn, 'TS_GCN_PYTHON', '/fake/envs/ts_gcn/bin/python'), \ + patch.object(ts_gcn, 'run_in_conda_env', + side_effect=fake_run_in_conda_env_factory(returncode=1, write_ts_file=False)) as mock_run: + ts_gcn.run_subprocess_locally(direction='R', + reactant_path=self.product_path, + product_path=self.reactant_path, + ts_path=os.path.join(self.output_dir, 'TS_rev.xyz'), + local_path=self.output_dir, + ts_species=ts_species, + ) + self.assertEqual(mock_run.call_count, 1) + self.assertEqual(len(ts_species.ts_guesses), 0) + + def test_execute_incore(self): + """Test the full incore execution path through the mocked subprocess boundary.""" + rxn = self.get_reaction() + adapter = self.get_adapter(rxn) + with patch.object(ts_gcn, 'TS_GCN_PYTHON', '/fake/envs/ts_gcn/bin/python'), \ + patch.object(ts_gcn, 'gcn_available', return_value=True), \ + patch.object(ts_gcn, 'run_in_conda_env', + side_effect=fake_run_in_conda_env_factory()) as mock_run: + adapter.execute_incore() + # dihedral_increment=1 -> one repetition -> one forward + one reverse call. + self.assertEqual(mock_run.call_count, 2) + # The input SDF files must have been written before calling the subprocess. + self.assertTrue(os.path.isfile(adapter.reactant_path)) + self.assertTrue(os.path.isfile(adapter.product_path)) + fwd_flags = dict(zip(mock_run.call_args_list[0].args[2::2], mock_run.call_args_list[0].args[3::2])) + rev_flags = dict(zip(mock_run.call_args_list[1].args[2::2], mock_run.call_args_list[1].args[3::2])) + self.assertEqual(fwd_flags['--r_sdf_path'], adapter.reactant_path) + self.assertEqual(fwd_flags['--p_sdf_path'], adapter.product_path) + self.assertEqual(fwd_flags['--ts_xyz_path'], adapter.ts_fwd_path) + # The reverse direction swaps reactant and product. + self.assertEqual(rev_flags['--r_sdf_path'], adapter.product_path) + self.assertEqual(rev_flags['--p_sdf_path'], adapter.reactant_path) + self.assertEqual(rev_flags['--ts_xyz_path'], adapter.ts_rev_path) + self.assertEqual(len(rxn.ts_species.ts_guesses), 2) + self.assertEqual(rxn.ts_species.ts_guesses[0].method_direction, 'F') + self.assertEqual(rxn.ts_species.ts_guesses[1].method_direction, 'R') + self.assertTrue(all(tsg.success for tsg in rxn.ts_species.ts_guesses)) + + def test_execute_incore_gcn_unavailable(self): + """Test that execution degrades gracefully (no crash, no subprocess) when the ts_gcn env is missing.""" + rxn = self.get_reaction() + adapter = self.get_adapter(rxn) + with patch.object(ts_gcn, 'TS_GCN_PYTHON', None), \ + patch.object(ts_gcn, 'run_in_conda_env') as mock_run: + adapter.execute_incore() + mock_run.assert_not_called() + self.assertTrue(rxn.ts_species is None or len(rxn.ts_species.ts_guesses) == 0) def test_process_tsg(self): """Test the process_tsg() function.""" expected_ts_xyz_path = os.path.join(self.output_dir, 'GCN R 0.xyz') - if os.path.isfile(expected_ts_xyz_path): - os.remove(expected_ts_xyz_path) - self.rxn_1.ts_species = ARCSpecies(label='TS', is_ts=True) + ts_species = ARCSpecies(label='TS', is_ts=True) ts_gcn.process_tsg(direction='R', ts_xyz=str_to_xyz(os.path.join(ARC_TESTING_PATH, 'opt', 'TS_nC3H7-iC3H7.out')), local_path=self.output_dir, - ts_species=self.rxn_1.ts_species, - tsg=TSGuess(method=f'GCN', + ts_species=ts_species, + tsg=TSGuess(method='GCN', method_direction='R', index=0, ), @@ -101,14 +225,78 @@ def test_process_tsg(self): self.assertTrue(os.path.isfile(expected_ts_xyz_path)) xyz = str_to_xyz(expected_ts_xyz_path) self.assertEqual(xyz['symbols'], ('C', 'C', 'C', 'H', 'H', 'H', 'H', 'H', 'H', 'H')) + self.assertEqual(len(ts_species.ts_guesses), 1) + + +class TestGCNScript(unittest.TestCase): + """ + Contains unit tests for the standalone gcn_script.py + (its plumbing only; the actual inference import requires the ts_gcn env). + """ - @classmethod - def tearDownClass(cls): + def setUp(self): """ - A function that is run ONCE after all unit tests in this class. - Delete all project directories created during these unit tests. + A method that is run before each unit test in this class. """ - shutil.rmtree(os.path.join(ARC_TESTING_PATH, 'GCN'), ignore_errors=True) + self.maxDiff = None + self.output_dir = os.path.join(ARC_TESTING_PATH, f'GCN_script_{self._testMethodName}') + os.makedirs(self.output_dir, exist_ok=True) + self.addCleanup(shutil.rmtree, self.output_dir, ignore_errors=True) + self.gcn_script = load_gcn_script() + + def test_no_arc_imports(self): + """gcn_script.py runs in the ts_gcn env and must not import any arc modules.""" + with open(GCN_SCRIPT_PATH, 'r') as f: + content = f.read() + self.assertNotIn('import arc', content) + self.assertNotIn('from arc', content) + + def test_parse_command_line_arguments(self): + """Test both CLI modes of gcn_script.py.""" + args = self.gcn_script.parse_command_line_arguments(['--yml_in_path', 'input.yml']) + self.assertEqual(args.yml_in_path, 'input.yml') + self.assertIsNone(args.r_sdf_path) + args = self.gcn_script.parse_command_line_arguments( + ['--r_sdf_path', 'r.sdf', '--p_sdf_path', 'p.sdf', '--ts_xyz_path', 'ts.xyz']) + self.assertIsNone(args.yml_in_path) + self.assertEqual(args.r_sdf_path, 'r.sdf') + self.assertEqual(args.p_sdf_path, 'p.sdf') + self.assertEqual(args.ts_xyz_path, 'ts.xyz') + + def test_initialize_gcn_run(self): + """Test the batch (queue) mode: input dict in, TS guess list YAML out.""" + yml_out_path = os.path.join(self.output_dir, 'output.yml') + input_dict = {'reactant_path': os.path.join(self.output_dir, 'reactant.sdf'), + 'product_path': os.path.join(self.output_dir, 'product.sdf'), + 'local_path': self.output_dir, + 'yml_out_path': yml_out_path, + 'repetitions': 2, + } + + def fake_run_gcn(r_sdf_path, p_sdf_path, ts_xyz_path): + with open(ts_xyz_path, 'w') as f: + f.write(TS_XYZ_F) + return True + + original_run_gcn = self.gcn_script.run_gcn + self.gcn_script.run_gcn = fake_run_gcn + self.addCleanup(setattr, self.gcn_script, 'run_gcn', original_run_gcn) + self.gcn_script.initialize_gcn_run(input_dict=input_dict) + self.assertTrue(os.path.isfile(yml_out_path)) + tsgs = read_yaml_file(yml_out_path) + self.assertEqual(len(tsgs), 4) # 2 repetitions x 2 directions + self.assertEqual([tsg['method_direction'] for tsg in tsgs], ['F', 'R', 'F', 'R']) + for tsg in tsgs: + self.assertEqual(tsg['method'], 'GCN') + self.assertTrue(tsg['success']) + # The two xyz header lines must be stripped. + self.assertEqual(tsg['initial_xyz'].split()[0], 'O') + + def test_import_inference_raises_informatively(self): + """Without the ts_gcn env, import_inference() must raise an informative ImportError.""" + with self.assertRaises(ImportError) as cm: + self.gcn_script.import_inference() + self.assertIn('TS-GCN', str(cm.exception)) if __name__ == '__main__': diff --git a/arc/job/adapters/ts/gcn_ts.py b/arc/job/adapters/ts/gcn_ts.py index 98df155916..1b0f5add10 100644 --- a/arc/job/adapters/ts/gcn_ts.py +++ b/arc/job/adapters/ts/gcn_ts.py @@ -23,12 +23,6 @@ from arc.species.converter import rdkit_conf_from_mol, str_to_xyz from arc.species.species import ARCSpecies, TSGuess, colliding_atoms -HAS_GCN = True -try: - from inference import inference -except (ImportError, ModuleNotFoundError): - HAS_GCN = False - if TYPE_CHECKING: from arc.level import Level from arc.reaction import ARCReaction @@ -43,6 +37,20 @@ logger = get_logger() +def gcn_available() -> bool: + """ + Determine whether the GCN subprocess boundary is usable: + the ``ts_gcn`` environment's Python interpreter (``TS_GCN_PYTHON``) + resolves to an existing executable, and the standalone GCN script exists. + + Returns: + bool: Whether GCN can be executed. + """ + return (TS_GCN_PYTHON is not None + and os.path.isfile(TS_GCN_PYTHON) + and os.path.isfile(GCN_SCRIPT_PATH)) + + class GCNAdapter(JobAdapter): """ A class for executing GCN (graph convolutional network) jobs. @@ -272,6 +280,13 @@ def execute_gcn(self, exe_type: str = 'incore'): Args: exe_type (str, optional): Whether to execute 'incore' or 'queue'. """ + if not gcn_available(): + logger.warning(f'GCN is not available: TS_GCN_PYTHON is {TS_GCN_PYTHON}, ' + f'expected the python executable of the "ts_gcn" environment ' + f'(and {GCN_SCRIPT_PATH} must exist). ' + f'Install it via devtools/install_gcn.sh (see {self.url}). ' + f'Skipping GCN TS guesses for {self.reactions[0].label}.') + return self._log_job_execution() rxn = self.reactions[0] if not rxn.is_isomerization(): @@ -370,12 +385,13 @@ def run_subprocess_locally(direction: str, # leak into the child (see arc/job/env_run.py). output = run_in_conda_env( TS_GCN_PYTHON, GCN_SCRIPT_PATH, - '--r_sdf_path', product_path, - '--p_sdf_path', reactant_path, + '--r_sdf_path', reactant_path, + '--p_sdf_path', product_path, '--ts_xyz_path', ts_path, ) if output.returncode: - logger.warning(f'GCN subprocess ran in the reverse direction did not ' + direction_str = 'forward' if direction == 'F' else 'reverse' + logger.warning(f'GCN subprocess run in the {direction_str} direction did not ' f'give a successful return code for {ts_species}.\n' f'Got return code: {output.returncode}\n' f'stdout: {output.stdout}\n' diff --git a/arc/job/adapters/ts/heuristics.py b/arc/job/adapters/ts/heuristics.py index 805ea97067..05a1e234fb 100644 --- a/arc/job/adapters/ts/heuristics.py +++ b/arc/job/adapters/ts/heuristics.py @@ -21,28 +21,44 @@ import os from typing import TYPE_CHECKING, Any -from arc.common import (ARC_PATH, almost_equal_coords, get_angle_in_180_range, get_logger, is_angle_linear, - is_xyz_linear, key_by_val, read_yaml_file) +from arc.common import ( + ARC_PATH, + almost_equal_coords, + get_angle_in_180_range, + get_logger, + is_angle_linear, + is_xyz_linear, + key_by_val, + read_yaml_file, +) from arc.family import get_reaction_family_products from arc.job.adapter import JobAdapter from arc.job.adapters.common import _initialize_adapter, ts_adapters_by_rmg_family from arc.job.factory import register_job_adapter from arc.plotter import save_geo -from arc.species.converter import (compare_zmats, relocate_zmat_dummy_atoms_to_the_end, zmat_from_xyz, zmat_to_xyz, - add_atom_to_xyz_using_internal_coords, sorted_distances_of_atom) +from arc.species.converter import ( + add_atom_to_xyz_using_internal_coords, + compare_zmats, + relocate_zmat_dummy_atoms_to_the_end, + sorted_distances_of_atom, + zmat_from_xyz, + zmat_to_xyz, +) from arc.mapping.engine import map_two_species from arc.molecule.molecule import Molecule from arc.species.species import ARCSpecies, TSGuess, SpeciesError, colliding_atoms from arc.species.zmat import get_parameter_from_atom_indices, remove_zmat_atom_0, up_param, xyz_to_zmat from arc.species.vectors import calculate_angle +from arc.job.adapters.ts.seed_hub import get_ts_seeds if TYPE_CHECKING: from arc.level import Level from arc.reaction import ARCReaction - -FAMILY_SETS = {'hydrolysis_set_1': ['carbonyl_based_hydrolysis', 'ether_hydrolysis'], - 'hydrolysis_set_2': ['nitrile_hydrolysis']} +FAMILY_SETS = { + 'hydrolysis_set_1': ['carbonyl_based_hydrolysis', 'ether_hydrolysis'], + 'hydrolysis_set_2': ['nitrile_hydrolysis'], +} DIHEDRAL_INCREMENT = 30 @@ -258,56 +274,60 @@ def execute_incore(self): multiplicity=rxn.multiplicity, ) - xyzs = list() - tsg, families = None, None - if rxn.family == 'H_Abstraction': - tsg = TSGuess(method='Heuristics') - tsg.tic() - xyzs = h_abstraction(reaction=rxn, dihedral_increment=self.dihedral_increment) - tsg.tok() - + tsg = TSGuess(method='Heuristics') + tsg.tic() + xyzs = get_ts_seeds( + reaction=rxn, + base_adapter='heuristics', + dihedral_increment=self.dihedral_increment, + ) + tsg.tok() if rxn.family in FAMILY_SETS['hydrolysis_set_1'] or rxn.family in FAMILY_SETS['hydrolysis_set_2']: - try: - tsg = TSGuess(method='Heuristics') - tsg.tic() - xyzs, families, indices = hydrolysis(reaction=rxn) - tsg.tok() - if not xyzs: - logger.warning(f'Heuristics TS search failed to generate any valid TS guesses for {rxn.label}.') - continue - except ValueError: + if not xyzs: + logger.warning( + f'Heuristics TS search failed to generate any valid TS guesses for {rxn.label}.' + ) continue - for method_index, xyz in enumerate(xyzs): + for method_index, xyz_entry in enumerate(xyzs): + xyz = xyz_entry.get("xyz") + method_label = xyz_entry.get("method", "Heuristics") + family = xyz_entry.get("family", rxn.family) + if xyz is None: + continue unique = True for other_tsg in rxn.ts_species.ts_guesses: if almost_equal_coords(xyz, other_tsg.initial_xyz): - if 'heuristics' not in other_tsg.method.lower(): - other_tsg.method += ' and Heuristics' + existing_sources = getattr(other_tsg, "method_sources", None) + if existing_sources is not None: + combined_sources = list(existing_sources) + [method_label] + else: + combined_sources = [other_tsg.method, method_label] + other_tsg.method_sources = TSGuess._normalize_method_sources(combined_sources) unique = False break if unique: - ts_guess = TSGuess(method='Heuristics', + ts_guess = TSGuess(method=method_label, index=len(rxn.ts_species.ts_guesses), method_index=method_index, t0=tsg.t0, execution_time=tsg.execution_time, success=True, - family=rxn.family if families is None else families[method_index], + family=family, xyz=xyz, ) rxn.ts_species.ts_guesses.append(ts_guess) save_geo(xyz=xyz, path=self.local_path, - filename=f'Heuristics_{method_index}', + filename=f'{method_label}_{method_index}', format_='xyz', - comment=f'Heuristics {method_index}, family: {rxn.family}', + comment=f'{method_label} {method_index}, family: {rxn.family}', ) if len(self.reactions) < 5: - successes = len([tsg for tsg in rxn.ts_species.ts_guesses if tsg.success and 'heuristics' in tsg.method]) + successes = [tsg for tsg in rxn.ts_species.ts_guesses if tsg.success] if successes: - logger.info(f'Heuristics successfully found {successes} TS guesses for {rxn.label}.') + logger.info(f'Heuristics successfully found {len(successes)} TS guesses for {rxn.label}.') else: logger.info(f'Heuristics did not find any successful TS guesses for {rxn.label}.') @@ -872,7 +892,7 @@ def h_abstraction(reaction: ARCReaction, dihedral_increment (int, optional): The dihedral increment to use for B-H-A-C and D-B-H-C dihedral scans. Returns: list[dict] - Entries are Cartesian coordinates of TS guesses for all reactions. + Entries hold Cartesian coordinates of TS guesses and the generating method label. """ xyz_guesses = list() dihedral_increment = dihedral_increment or DIHEDRAL_INCREMENT @@ -951,7 +971,8 @@ def h_abstraction(reaction: ARCReaction, else: # This TS is unique, and has no atom collisions. zmats.append(zmat_guess) - xyz_guesses.append(xyz_guess) + xyz_guesses.append({"xyz": xyz_guess, "method": "Heuristics"}) + return xyz_guesses @@ -986,9 +1007,11 @@ def hydrolysis(reaction: ARCReaction) -> tuple[list[dict], list[dict], list[int] is_set_1 = reaction_family in hydrolysis_parameters["family_sets"]["set_1"] is_set_2 = reaction_family in hydrolysis_parameters["family_sets"]["set_2"] - main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices(reaction, - product_dict, - is_set_1) + main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices( + reaction, + product_dict, + is_set_1, + ) base_xyz_indices = { "a": xyz_indices["a"], "b": xyz_indices["b"], @@ -998,9 +1021,19 @@ def hydrolysis(reaction: ARCReaction) -> tuple[list[dict], list[dict], list[int] } adjustments_to_try = [False, True] if dihedrals_to_change_num == 1 else [True] for adjust_dihedral in adjustments_to_try: - chosen_xyz_indices, xyz_guesses, zmats_total, n_dihedrals_found = process_chosen_d_indices(initial_xyz, base_xyz_indices, xyz_indices, - hydrolysis_parameters,reaction_family, water, zmats_total, is_set_1, is_set_2, - dihedrals_to_change_num, should_adjust_dihedral=adjust_dihedral) + chosen_xyz_indices, xyz_guesses, zmats_total, n_dihedrals_found = process_chosen_d_indices( + initial_xyz, + base_xyz_indices, + xyz_indices, + hydrolysis_parameters, + reaction_family, + water, + zmats_total, + is_set_1, + is_set_2, + dihedrals_to_change_num, + should_adjust_dihedral=adjust_dihedral, + ) max_dihedrals_found = max(max_dihedrals_found, n_dihedrals_found) if xyz_guesses: xyz_guesses_total.extend(xyz_guesses) @@ -1014,8 +1047,8 @@ def hydrolysis(reaction: ARCReaction) -> tuple[list[dict], list[dict], list[int] condition_met = len(xyz_guesses_total) > 0 nitrile_in_inputs = any( - (pd.get("family") == "nitrile_hydrolysis") or - (isinstance(pd.get("family"), list) and "nitrile_hydrolysis" in pd.get("family")) + (pd.get("family") == "nitrile_hydrolysis") + or (isinstance(pd.get("family"), list) and "nitrile_hydrolysis" in pd.get("family")) for pd in product_dicts ) nitrile_already_found = any(fam == "nitrile_hydrolysis" for fam in reaction_families) @@ -1031,9 +1064,11 @@ def hydrolysis(reaction: ARCReaction) -> tuple[list[dict], list[dict], list[int] is_set_1 = reaction_family in hydrolysis_parameters["family_sets"]["set_1"] is_set_2 = reaction_family in hydrolysis_parameters["family_sets"]["set_2"] - main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices(reaction, - product_dict, - is_set_1) + main_reactant, water, initial_xyz, xyz_indices = extract_reactant_and_indices( + reaction, + product_dict, + is_set_1, + ) base_xyz_indices = { "a": xyz_indices["a"], "b": xyz_indices["b"], @@ -1047,10 +1082,18 @@ def hydrolysis(reaction: ARCReaction) -> tuple[list[dict], list[dict], list[int] break dihedrals_to_change_num += 1 chosen_xyz_indices, xyz_guesses, zmats_total, n_dihedrals_found = process_chosen_d_indices( - initial_xyz, base_xyz_indices, xyz_indices, - hydrolysis_parameters, reaction_family, water, zmats_total, is_set_1, is_set_2, - dihedrals_to_change_num, should_adjust_dihedral=True, - allow_nitrile_dihedrals=True + initial_xyz, + base_xyz_indices, + xyz_indices, + hydrolysis_parameters, + reaction_family, + water, + zmats_total, + is_set_1, + is_set_2, + dihedrals_to_change_num, + should_adjust_dihedral=True, + allow_nitrile_dihedrals=True, ) max_dihedrals_found = max(max_dihedrals_found, n_dihedrals_found) @@ -1082,11 +1125,13 @@ def get_products_and_check_families(reaction: ARCReaction) -> tuple[list[dict], consider_arc_families=True, ) carbonyl_based_present = any( - "carbonyl_based_hydrolysis" in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) + "carbonyl_based_hydrolysis" + in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) for d in product_dicts ) ether_present = any( - "ether_hydrolysis" in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) + "ether_hydrolysis" + in (d.get("family", []) if isinstance(d.get("family"), list) else [d.get("family")]) for d in product_dicts ) @@ -1162,11 +1207,13 @@ def extract_reactant_and_indices(reaction: ARCReaction, main_reactant, a_xyz_index, b_xyz_index, - two_neighbors + two_neighbors, ) except ValueError as e: - raise ValueError(f"Failed to determine neighbors by electronegativity for atom {a_xyz_index} " - f"in species {main_reactant.label}: {e}") + raise ValueError( + f"Failed to determine neighbors by electronegativity for atom {a_xyz_index} " + f"in species {main_reactant.label}: {e}" + ) o_index = len(main_reactant.mol.atoms) h1_index = o_index + 1 @@ -1177,7 +1224,7 @@ def extract_reactant_and_indices(reaction: ARCReaction, "e": e_xyz_index, "d": d_xyz_indices, "o": o_index, - "h1": h1_index + "h1": h1_index, } return main_reactant, water, initial_xyz, xyz_indices @@ -1222,11 +1269,18 @@ def process_chosen_d_indices(initial_xyz: dict, """ max_dihedrals_found = 0 for d_index in xyz_indices.get("d", []) or [None]: - chosen_xyz_indices = {**base_xyz_indices, "d": d_index} if d_index is not None else {**base_xyz_indices, - "d": None} + chosen_xyz_indices = {**base_xyz_indices, "d": d_index} if d_index is not None else { + **base_xyz_indices, + "d": None, + } current_zmat, zmat_indices = setup_zmat_indices(initial_xyz, chosen_xyz_indices) - matches = get_matching_dihedrals(current_zmat, zmat_indices['a'], zmat_indices['b'], - zmat_indices['e'], zmat_indices['d']) + matches = get_matching_dihedrals( + current_zmat, + zmat_indices['a'], + zmat_indices['b'], + zmat_indices['e'], + zmat_indices['d'], + ) max_dihedrals_found = max(max_dihedrals_found, len(matches)) if should_adjust_dihedral and dihedrals_to_change_num > len(matches): continue @@ -1244,22 +1298,28 @@ def process_chosen_d_indices(initial_xyz: dict, zmat_variants = generate_dihedral_variants(current_zmat, indices, adjustment_factors) if zmat_variants: adjusted_zmats.extend(zmat_variants) - if not adjusted_zmats: - pass - else: + if adjusted_zmats: zmats_to_process = adjusted_zmats ts_guesses_list = [] for zmat_to_process in zmats_to_process: ts_guesses, updated_zmats = process_family_specific_adjustments( - is_set_1, is_set_2, reaction_family, hydrolysis_parameters, - zmat_to_process, water, chosen_xyz_indices, zmats_total) + is_set_1, + is_set_2, + reaction_family, + hydrolysis_parameters, + zmat_to_process, + water, + chosen_xyz_indices, + zmats_total, + ) zmats_total = updated_zmats ts_guesses_list.extend(ts_guesses) if attempted_dihedral_adjustments and not ts_guesses_list and ( - reaction_family != 'nitrile_hydrolysis' or allow_nitrile_dihedrals): - flipped_zmats= [] + reaction_family != 'nitrile_hydrolysis' or allow_nitrile_dihedrals + ): + flipped_zmats = [] adjustment_factors = [15, 25, 35, 45, 55] for indices in indices_list: flipped_variants = generate_dihedral_variants(current_zmat, indices, adjustment_factors, flip=True) @@ -1267,8 +1327,14 @@ def process_chosen_d_indices(initial_xyz: dict, for zmat_to_process in flipped_zmats: ts_guesses, updated_zmats = process_family_specific_adjustments( - is_set_1, is_set_2, reaction_family, hydrolysis_parameters, - zmat_to_process, water, chosen_xyz_indices, zmats_total + is_set_1, + is_set_2, + reaction_family, + hydrolysis_parameters, + zmat_to_process, + water, + chosen_xyz_indices, + zmats_total, ) zmats_total = updated_zmats ts_guesses_list.extend(ts_guesses) @@ -1338,8 +1404,11 @@ def get_neighbors_by_electronegativity(spc: ARCSpecies, Raises: ValueError: If the atom has no valid neighbors. """ - neighbors = [neighbor for neighbor in spc.mol.atoms[atom_index].edges.keys() - if spc.mol.atoms.index(neighbor) != exclude_index] + neighbors = [ + neighbor + for neighbor in spc.mol.atoms[atom_index].edges.keys() + if spc.mol.atoms.index(neighbor) != exclude_index + ] if not neighbors: raise ValueError(f"Atom at index {atom_index} has no valid neighbors.") @@ -1353,12 +1422,17 @@ def get_neighbor_total_electronegativity(neighbor: 'Atom') -> float: float: The total electronegativity of the neighbor """ return sum( - ELECTRONEGATIVITIES[n.symbol] * neighbor.edges[n].order - for n in neighbor.edges.keys() + ELECTRONEGATIVITIES[n.symbol] * neighbor.edges[n].order for n in neighbor.edges.keys() ) - effective_electronegativities = [(ELECTRONEGATIVITIES[n.symbol] * spc.mol.atoms[atom_index].edges[n].order, - get_neighbor_total_electronegativity(n), n ) for n in neighbors] + effective_electronegativities = [ + ( + ELECTRONEGATIVITIES[n.symbol] * spc.mol.atoms[atom_index].edges[n].order, + get_neighbor_total_electronegativity(n), + n, + ) + for n in neighbors + ] effective_electronegativities.sort(reverse=True, key=lambda x: (x[0], x[1])) sorted_neighbors = [spc.mol.atoms.index(n[2]) for n in effective_electronegativities] most_electronegative = sorted_neighbors[0] @@ -1385,7 +1459,7 @@ def setup_zmat_indices(initial_xyz: dict, 'a': key_by_val(initial_zmat.get('map', {}), xyz_indices['a']), 'b': key_by_val(initial_zmat.get('map', {}), xyz_indices['b']), 'e': key_by_val(initial_zmat.get('map', {}), xyz_indices['e']), - 'd': key_by_val(initial_zmat.get('map', {}), xyz_indices['d']) if xyz_indices['d'] is not None else None + 'd': key_by_val(initial_zmat.get('map', {}), xyz_indices['d']) if xyz_indices['d'] is not None else None, } return initial_zmat, zmat_indices @@ -1396,15 +1470,15 @@ def generate_dihedral_variants(zmat: dict, flip: bool = False, tolerance_degrees: float = 10.0) -> list[dict]: """ - Create variants of a Z-matrix by adjusting dihedral angles using multiple adjustment factors. + Create variants of a Z-matrix by adjusting dihedral angles using multiple adjustment factors. This function creates variants of the Z-matrix using different adjustment factors: - 1. Retrieve the current dihedral value and normalize it to the (-180°, 180°] range. - 2. For each adjustment factor, slightly push the angle away from 0° or ±180° to avoid - unstable, boundary configurations. - 3. If `flip=True`, the same procedure is applied starting from a flipped - (180°-shifted) baseline angle. - 4. Each adjusted or flipped variant is deep-copied to ensure independence. + 1. Retrieve the current dihedral value and normalize it to the (-180°, 180°] range. + 2. For each adjustment factor, slightly push the angle away from 0° or ±180° to avoid + unstable, boundary configurations. + 3. If `flip=True`, the same procedure is applied starting from a flipped + (180°-shifted) baseline angle. + 4. Each adjusted or flipped variant is deep-copied to ensure independence. Args: zmat (dict): The initial Z-matrix. @@ -1412,7 +1486,8 @@ def generate_dihedral_variants(zmat: dict, adjustment_factors (list[float], optional): List of factors to try. flip (bool, optional): Whether to start from a flipped (180°) baseline dihedral angle. Defaults to False. - tolerance_degrees (float, optional): Tolerance (in degrees) for detecting angles near 0° or ±180°. Defaults to 10.0. + tolerance_degrees (float, optional): Tolerance (in degrees) for detecting angles near 0° or ±180°. + Defaults to 10.0. Returns: list[dict]: List of Z-matrix variants with adjusted dihedral angles. @@ -1438,8 +1513,9 @@ def push_up_dihedral(val: float, adj_factor: float) -> float: seed_value = normalized_value if flip: seed_value = get_angle_in_180_range(normalized_value + 180.0) - boundary_like = ((abs(seed_value) < tolerance_degrees) - or (180 - tolerance_degrees <= abs(seed_value) <= 180+tolerance_degrees)) + boundary_like = (abs(seed_value) < tolerance_degrees) or ( + 180 - tolerance_degrees <= abs(seed_value) <= 180 + tolerance_degrees + ) if boundary_like: for factor in adjustment_factors: variant = copy.deepcopy(zmat) @@ -1482,11 +1558,13 @@ def get_matching_dihedrals(zmat: dict, return matches -def stretch_ab_bond(initial_zmat: 'dict', - xyz_indices: 'dict', - zmat_indices: 'dict', - hydrolysis_parameters: 'dict', - reaction_family: str) -> None: +def stretch_ab_bond( + initial_zmat: dict, + xyz_indices: dict, + zmat_indices: dict, + hydrolysis_parameters: dict, + reaction_family: str, +) -> None: """ Stretch the bond between atoms a and b in the Z-matrix based on the reaction family parameters. @@ -1526,7 +1604,7 @@ def process_family_specific_adjustments(is_set_1: bool, xyz_indices: dict, zmats_total: list[dict]) -> tuple[list[dict], list[dict]]: """ - Process specific adjustments for different hydrolysis reaction families if needed, then generate TS guesses . + Process specific adjustments for different hydrolysis reaction families if needed, then generate TS guesses. Args: is_set_1 (bool): Whether the reaction belongs to parameter set 1. @@ -1544,21 +1622,34 @@ def process_family_specific_adjustments(is_set_1: bool, Raises: ValueError: If the reaction family is not supported. """ - a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz= xyz_indices.values() + a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz = xyz_indices.values() r_atoms = [a_xyz, o_xyz, o_xyz] a_atoms = [[b_xyz, a_xyz], [a_xyz, o_xyz], [h1_xyz, o_xyz]] - d_atoms = ([[e_xyz, d_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]] - if d_xyz is not None else - [[e_xyz, b_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]]) + d_atoms = ( + [[e_xyz, d_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]] + if d_xyz is not None + else [[e_xyz, b_xyz, a_xyz], [b_xyz, a_xyz, o_xyz], [a_xyz, h1_xyz, o_xyz]] + ) r_value = hydrolysis_parameters['family_parameters'][str(reaction_family)]['r_value'] a_value = hydrolysis_parameters['family_parameters'][str(reaction_family)]['a_value'] d_values = hydrolysis_parameters['family_parameters'][str(reaction_family)]['d_values'] if is_set_1 or is_set_2: initial_xyz = zmat_to_xyz(initial_zmat) - return generate_hydrolysis_ts_guess(initial_xyz, xyz_indices.values(), water, r_atoms, a_atoms, d_atoms, - r_value, a_value, d_values, zmats_total, is_set_1, - threshold=0.6 if reaction_family == 'nitrile_hydrolysis' else 0.8) + return generate_hydrolysis_ts_guess( + initial_xyz, + xyz_indices.values(), + water, + r_atoms, + a_atoms, + d_atoms, + r_value, + a_value, + d_values, + zmats_total, + is_set_1, + threshold=0.6 if reaction_family == 'nitrile_hydrolysis' else 0.8, + ) else: raise ValueError(f"Family {reaction_family} not supported for hydrolysis TS guess generation.") @@ -1598,7 +1689,7 @@ def generate_hydrolysis_ts_guess(initial_xyz: dict, """ xyz_guesses = [] - for index, d_value in enumerate(d_values): + for d_value in d_values: xyz_guess = copy.deepcopy(initial_xyz) for i in range(3): xyz_guess = add_atom_to_xyz_using_internal_coords( @@ -1609,18 +1700,18 @@ def generate_hydrolysis_ts_guess(initial_xyz: dict, d_indices=d_atoms[i], r_value=r_value[i], a_value=a_value[i], - d_value=d_value[i] + d_value=d_value[i], ) - a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz= xyz_indices - are_valid_bonds=check_ts_bonds(xyz_guess, [o_xyz, h1_xyz, h1_xyz+1, a_xyz, b_xyz]) - colliding=colliding_atoms(xyz_guess, threshold=threshold) + a_xyz, b_xyz, e_xyz, o_xyz, h1_xyz, d_xyz = xyz_indices + are_valid_bonds = check_ts_bonds(xyz_guess, [o_xyz, h1_xyz, h1_xyz + 1, a_xyz, b_xyz]) + colliding = colliding_atoms(xyz_guess, threshold=threshold) duplicate = any(compare_zmats(existing, xyz_to_zmat(xyz_guess)) for existing in zmats_total) if is_set_1: - dihedral_edao=[e_xyz, d_xyz, a_xyz, o_xyz] - dao_is_linear=check_dao_angle(dihedral_edao, xyz_guess) + dihedral_edao = [e_xyz, d_xyz, a_xyz, o_xyz] + dao_is_linear = check_dao_angle(dihedral_edao, xyz_guess) else: - dao_is_linear=False + dao_is_linear = False if xyz_guess is not None and not colliding and not duplicate and are_valid_bonds and not dao_is_linear: xyz_guesses.append(xyz_guess) zmats_total.append(xyz_to_zmat(xyz_guess)) @@ -1641,7 +1732,7 @@ def check_dao_angle(d_indices: list[int], xyz_guess: dict) -> bool: """ angle_indices = [d_indices[1], d_indices[2], d_indices[3]] angle_value = calculate_angle(xyz_guess, angle_indices) - norm_value=(angle_value + 180) % 180 + norm_value = (angle_value + 180) % 180 return (norm_value < 10) or (norm_value > 170) @@ -1656,7 +1747,7 @@ def check_ts_bonds(transition_state_xyz: dict, tested_atom_indices: list) -> boo Returns: bool: Whether the transition state guess has the expected water-related bonds. """ - oxygen_index, h1_index, h2_index, a_index, b_index= tested_atom_indices + oxygen_index, h1_index, h2_index, a_index, b_index = tested_atom_indices oxygen_bonds = sorted_distances_of_atom(transition_state_xyz, oxygen_index) h1_bonds = sorted_distances_of_atom(transition_state_xyz, h1_index) h2_bonds = sorted_distances_of_atom(transition_state_xyz, h2_index) @@ -1675,10 +1766,12 @@ def check_oxygen_bonds(bonds): return rel_error <= 0.1 return False - oxygen_has_valid_bonds = (oxygen_bonds[0][0] == h2_index and check_oxygen_bonds(oxygen_bonds)) - h1_has_valid_bonds = (h1_bonds[0][0] in {oxygen_index, b_index}and h1_bonds[1][0] in {oxygen_index, b_index}) + oxygen_has_valid_bonds = oxygen_bonds[0][0] == h2_index and check_oxygen_bonds(oxygen_bonds) + h1_has_valid_bonds = (h1_bonds[0][0] in {oxygen_index, b_index}) and ( + h1_bonds[1][0] in {oxygen_index, b_index} + ) h2_has_valid_bonds = h2_bonds[0][0] == oxygen_index return oxygen_has_valid_bonds and h1_has_valid_bonds and h2_has_valid_bonds -register_job_adapter('heuristics', HeuristicsAdapter) +register_job_adapter("heuristics", HeuristicsAdapter) diff --git a/arc/job/adapters/ts/heuristics_test.py b/arc/job/adapters/ts/heuristics_test.py index d49566a29a..032b668b66 100644 --- a/arc/job/adapters/ts/heuristics_test.py +++ b/arc/job/adapters/ts/heuristics_test.py @@ -10,6 +10,8 @@ import os import shutil import unittest +from types import SimpleNamespace +from unittest.mock import patch from arc.common import ARC_TESTING_PATH, almost_equal_coords from arc.family import get_reaction_family_products @@ -31,6 +33,7 @@ check_dao_angle, check_ts_bonds, ) +from arc.job.adapters.ts.seed_hub import get_ts_seeds, get_wrapper_constraints from arc.reaction import ARCReaction from arc.species.converter import str_to_xyz, zmat_to_xyz, zmat_from_xyz from arc.species.species import ARCSpecies @@ -2253,5 +2256,61 @@ def tearDownClass(cls): shutil.rmtree(os.path.join(ARC_TESTING_PATH, sub), ignore_errors=True) +class TestHeuristicsHub(unittest.TestCase): + """Unit tests for shared heuristic seed and CREST-constraint helpers.""" + + def test_get_ts_seeds_h_abstraction(self): + rxn = SimpleNamespace(family='H_Abstraction') + with patch('arc.job.adapters.ts.heuristics.h_abstraction', + return_value=[{'xyz': {'symbols': ('H',), 'coords': ((0.0, 0.0, 0.0),), 'isotopes': (1,)}, + 'method': 'Heuristics'}]): + seeds = get_ts_seeds(reaction=rxn, base_adapter='heuristics', dihedral_increment=60) + self.assertEqual(len(seeds), 1) + self.assertEqual(seeds[0]['family'], 'H_Abstraction') + self.assertEqual(seeds[0]['method'], 'Heuristics') + self.assertEqual(seeds[0]['source_adapter'], 'heuristics') + + def test_get_ts_seeds_hydrolysis(self): + rxn = SimpleNamespace(family='carbonyl_based_hydrolysis') + xyz = {'symbols': ('O',), 'coords': ((0.0, 0.0, 0.0),), 'isotopes': (16,)} + with patch('arc.job.adapters.ts.heuristics.hydrolysis', + return_value=([xyz], ['carbonyl_based_hydrolysis'], [[0, 1, 2]])): + seeds = get_ts_seeds(reaction=rxn, base_adapter='heuristics') + self.assertEqual(len(seeds), 1) + self.assertEqual(seeds[0]['family'], 'carbonyl_based_hydrolysis') + self.assertEqual(seeds[0]['xyz'], xyz) + self.assertEqual(seeds[0]['metadata'], {'indices': [0, 1, 2]}) + + def test_get_wrapper_constraints_crest(self): + rxn = SimpleNamespace(family='H_Abstraction') + xyz = str_to_xyz("""O 0.0000 0.0000 0.0000 + H 0.0000 0.0000 0.9600 + H 0.9000 0.0000 0.0000""") + seed = {'xyz': xyz, 'family': rxn.family} + atoms = get_wrapper_constraints(wrapper='crest', reaction=rxn, seed=seed) + self.assertIsInstance(atoms, dict) + self.assertSetEqual(set(atoms.keys()), {'A', 'H', 'B'}) + self.assertTrue(all(isinstance(v, int) for v in atoms.values())) + + def test_get_wrapper_constraints_crest_unsupported_family(self): + rxn = SimpleNamespace(family='carbonyl_based_hydrolysis') + xyz = str_to_xyz("""O 0.0000 0.0000 0.0000 + H 0.0000 0.0000 0.9600 + H 0.9000 0.0000 0.0000""") + seed = {'xyz': xyz, 'family': rxn.family} + atoms = get_wrapper_constraints(wrapper='crest', reaction=rxn, seed=seed) + self.assertIsNone(atoms) + + def test_get_ts_seeds_unsupported_adapter(self): + rxn = SimpleNamespace(family='H_Abstraction') + with self.assertRaises(ValueError): + get_ts_seeds(reaction=rxn, base_adapter='gcn') + + def test_get_wrapper_constraints_unsupported_wrapper(self): + rxn = SimpleNamespace(family='H_Abstraction') + with self.assertRaises(ValueError): + get_wrapper_constraints(wrapper='foo_wrapper', reaction=rxn, seed={}) + + if __name__ == '__main__': unittest.main(testRunner=unittest.TextTestRunner(verbosity=2)) diff --git a/arc/job/adapters/ts/kinbot_test.py b/arc/job/adapters/ts/kinbot_test.py index e61ef4cf88..a6327e42c0 100644 --- a/arc/job/adapters/ts/kinbot_test.py +++ b/arc/job/adapters/ts/kinbot_test.py @@ -2,22 +2,34 @@ # encoding: utf-8 """ -This module contains unit tests of the arc.job.adapters.ts.autotst module +This module contains unit tests of the arc.job.adapters.ts.kinbot_ts module """ +import math import os import shutil +import subprocess import unittest +from unittest import mock -from arc.common import ARC_TESTING_PATH -from arc.job.adapters.ts.kinbot_ts import KinBotAdapter, HAS_KINBOT +from arc.common import ARC_TESTING_PATH, read_yaml_file, save_yaml_file +import arc.job.adapters.ts.kinbot_ts as kinbot_ts +from arc.job.adapters.ts.kinbot_ts import KinBotAdapter from arc.reaction import ARCReaction from arc.species import ARCSpecies +def kinbot_list_to_coords(structure: list) -> list: + """A helper function to convert a flat KinBot structure list into an N x 3 coordinates list.""" + coords = list() + for i in range(0, len(structure), 4): + coords.append([float(structure[i + 1]), float(structure[i + 2]), float(structure[i + 3])]) + return coords + + class TestKinBotAdapter(unittest.TestCase): """ - Contains unit tests for the AutoTSTAdapter class. + Contains unit tests for the KinBotAdapter class. """ @classmethod @@ -26,48 +38,220 @@ def setUpClass(cls): A method that is run before all unit tests in this class. """ cls.maxDiff = None + cls.project_dir = os.path.join(ARC_TESTING_PATH, 'test_KinBot') + + def setUp(self): + """ + A method that is run before each unit test in this class. + """ + self.rxn_1 = ARCReaction(reactants=['CC[O]'], products=['[CH2]CO'], + r_species=[ARCSpecies(label='CC[O]', smiles='CC[O]')], + p_species=[ARCSpecies(label='[CH2]CO', smiles='[CH2]CO')]) + + def _remove_test_dir(self, path: str): + """A helper function to remove a single test's project directory (and the shared + parent directory if it is empty). Tests may run in parallel (pytest-xdist), so + each test must only ever remove its own subdirectory.""" + shutil.rmtree(path, ignore_errors=True) + try: + os.rmdir(self.project_dir) + except OSError: + pass + + def get_adapter(self, dir_name: str) -> KinBotAdapter: + """A helper function to instantiate a KinBotAdapter instance.""" + project_directory = os.path.join(self.project_dir, dir_name) + self.addCleanup(self._remove_test_dir, project_directory) + return KinBotAdapter(job_type='tsg', + reactions=[self.rxn_1], + testing=True, + project='test', + project_directory=project_directory, + ) + + def test_family_map(self): + """Test that the KinBot family map supports the expected RMG families.""" + self.assertEqual(self.rxn_1.family, 'intra_H_migration') + adapter = self.get_adapter(dir_name='tst_family_map') + self.assertIn('intra_H_migration', adapter.supported_families) + self.assertEqual(adapter.family_map['intra_H_migration'], + ['intra_H_migration', 'intra_H_migration_suprafacial']) + self.assertEqual(adapter.family_map['R_Addition_COm'], ['R_Addition_COm3_R']) + + def test_missing_kinbot_python(self): + """Test that execute_incore() raises if the kinbot_env python executable is missing.""" + adapter = self.get_adapter(dir_name='tst_missing_python') + with mock.patch.object(kinbot_ts, 'KINBOT_PYTHON', None): + with self.assertRaises(FileNotFoundError): + adapter.execute_incore() - # @pytest.mark.skip(reason="KinBot support in ARC has been deprecated") def test_intra_h_migration(self): - """Test KinBot for intra H migration reactions""" - if HAS_KINBOT: - rxn1 = ARCReaction(reactants=['CC[O]'], products=['[CH2]CO'], - r_species=[ARCSpecies(label='CC[O]', smiles='CC[O]')], - p_species=[ARCSpecies(label='[CH2]CO', smiles='[CH2]CO')]) - self.assertEqual(rxn1.family, 'intra_H_migration') - kinbot1 = KinBotAdapter(job_type='tsg', - reactions=[rxn1], - testing=True, - project='test', - project_directory=os.path.join(ARC_TESTING_PATH, 'test_KinBot', 'tst1'), - ) - kinbot1.execute_incore() - self.assertTrue(rxn1.ts_species.is_ts) - self.assertEqual(rxn1.ts_species.charge, 0) - self.assertEqual(rxn1.ts_species.multiplicity, 2) - self.assertEqual(len(rxn1.ts_species.ts_guesses), 2) - self.assertEqual(rxn1.ts_species.ts_guesses[0].initial_xyz['symbols'], - ('C', 'C', 'O', 'H', 'H', 'H', 'H', 'H')) - self.assertEqual(rxn1.ts_species.ts_guesses[1].initial_xyz['symbols'], - ('C', 'C', 'O', 'H', 'H', 'H', 'H', 'H')) - self.assertEqual(len(rxn1.ts_species.ts_guesses[1].initial_xyz['coords']), 8) - self.assertEqual(rxn1.ts_species.ts_guesses[0].method, 'kinbot') - self.assertEqual(rxn1.ts_species.ts_guesses[1].method, 'kinbot') - self.assertEqual(rxn1.ts_species.ts_guesses[0].method_index, 0) - self.assertEqual(rxn1.ts_species.ts_guesses[1].method_index, 1) - self.assertEqual(rxn1.ts_species.ts_guesses[0].method_direction, 'F') - self.assertEqual(rxn1.ts_species.ts_guesses[1].method_direction, 'R') - self.assertLess(rxn1.ts_species.ts_guesses[0].execution_time.seconds, 300) # 0:00:00.003082 - self.assertTrue(rxn1.ts_species.ts_guesses[0].success) - self.assertTrue(rxn1.ts_species.ts_guesses[1].success) + """Test KinBot for intra H migration reactions with a mocked subprocess boundary.""" + self.assertEqual(self.rxn_1.family, 'intra_H_migration') + adapter = self.get_adapter(dir_name='tst1') + captured = dict() - @classmethod - def tearDownClass(cls): + def fake_run_in_conda_env(python_executable, script_path, *script_args, check=False): + """Mimic the KinBot worker script: read the input file, write an output file.""" + captured['python_executable'] = python_executable + captured['script_path'] = script_path + captured['script_args'] = script_args + self.assertEqual(script_args[0], '--yml_in_path') + input_dict = read_yaml_file(path=script_args[1]) + captured['input_dict'] = input_dict + results = list() + for well in input_dict['wells']: + # Return the (valid, non-colliding) well geometry itself as the "TS guess". + results.append({'direction': well['direction'], + 'success': True, + 'coords': kinbot_list_to_coords(well['structure']), + 'execution_time': '0:00:00.104819', + 'uma_refined': False, + }) + save_yaml_file(path=input_dict['yml_out_path'], content=results) + return subprocess.CompletedProcess(args=[], returncode=0, stdout='', stderr='') + + with mock.patch.object(kinbot_ts, 'KINBOT_PYTHON', kinbot_ts.__file__), \ + mock.patch.object(kinbot_ts, 'run_in_conda_env', side_effect=fake_run_in_conda_env) as run_mock: + adapter.execute_incore() + + # Assert the boundary was crossed correctly. + self.assertEqual(run_mock.call_count, 1) + self.assertEqual(captured['python_executable'], kinbot_ts.__file__) + self.assertEqual(captured['script_path'], kinbot_ts.KINBOT_SCRIPT_PATH) + self.assertTrue(captured['script_path'].endswith(os.path.join('scripts', 'kinbot_script.py'))) + + # Assert the input file contents. + input_dict = captured['input_dict'] + self.assertEqual(input_dict['charge'], 0) + self.assertEqual(input_dict['multiplicity'], 2) + self.assertEqual(input_dict['families'], ['intra_H_migration', 'intra_H_migration_suprafacial']) + self.assertIn('uma', input_dict) + self.assertFalse(input_dict['uma']['refine']) # UMA refinement is opt-in, off by default. + self.assertEqual(input_dict['yml_out_path'], adapter.yml_out_path) + directions = [well['direction'] for well in input_dict['wells']] + self.assertIn('F', directions) + self.assertIn('R', directions) + for well in input_dict['wells']: + self.assertIsInstance(well['smiles'], str) + self.assertEqual(len(well['structure']), 4 * 8) # 8 atoms, 4 entries per atom. + + # Assert the output parsing. + self.assertTrue(self.rxn_1.ts_species.is_ts) + self.assertEqual(self.rxn_1.ts_species.charge, 0) + self.assertEqual(self.rxn_1.ts_species.multiplicity, 2) + self.assertEqual(len(self.rxn_1.ts_species.ts_guesses), 2) + self.assertEqual(self.rxn_1.ts_species.ts_guesses[0].initial_xyz['symbols'], + ('C', 'C', 'O', 'H', 'H', 'H', 'H', 'H')) + self.assertEqual(self.rxn_1.ts_species.ts_guesses[1].initial_xyz['symbols'], + ('C', 'C', 'O', 'H', 'H', 'H', 'H', 'H')) + self.assertEqual(len(self.rxn_1.ts_species.ts_guesses[1].initial_xyz['coords']), 8) + self.assertEqual(self.rxn_1.ts_species.ts_guesses[0].method, 'kinbot') + self.assertEqual(self.rxn_1.ts_species.ts_guesses[1].method, 'kinbot') + self.assertEqual(self.rxn_1.ts_species.ts_guesses[0].method_index, 0) + self.assertEqual(self.rxn_1.ts_species.ts_guesses[1].method_index, 1) + self.assertEqual(self.rxn_1.ts_species.ts_guesses[0].method_direction, 'F') + self.assertEqual(self.rxn_1.ts_species.ts_guesses[1].method_direction, 'R') + self.assertTrue(self.rxn_1.ts_species.ts_guesses[0].success) + self.assertTrue(self.rxn_1.ts_species.ts_guesses[1].success) + + def test_subprocess_failure(self): + """Test that a failed KinBot subprocess results in no TS guesses without raising.""" + adapter = self.get_adapter(dir_name='tst2') + + def fake_failing_run(python_executable, script_path, *script_args, check=False): + # Simulate a crashed worker: non-zero return code, no output file written. + return subprocess.CompletedProcess(args=[], returncode=1, stdout='', stderr='kinbot crashed') + + with mock.patch.object(kinbot_ts, 'KINBOT_PYTHON', kinbot_ts.__file__), \ + mock.patch.object(kinbot_ts, 'run_in_conda_env', side_effect=fake_failing_run): + adapter.execute_incore() + self.assertEqual(len(self.rxn_1.ts_species.ts_guesses), 0) + + def test_parse_kinbot_results(self): + """Test parsing worker results: duplicates are merged, failures are kept as unsuccessful guesses.""" + adapter = self.get_adapter(dir_name='tst3') + self.rxn_1.ts_species = ARCSpecies(label='TS', is_ts=True, charge=0, multiplicity=2) + species_to_explore = {'F': self.rxn_1.r_species[0], 'R': self.rxn_1.p_species[0]} + coords_f = [list(coord) for coord in self.rxn_1.r_species[0].get_xyz()['coords']] + results = [{'direction': 'F', 'success': True, 'coords': coords_f, 'execution_time': '0:00:00.05'}, + {'direction': 'F', 'success': True, 'coords': coords_f, 'execution_time': '0:00:00.05'}, # duplicate + {'direction': 'R', 'success': False, 'coords': None, 'execution_time': '0:00:00.01', + 'error': 'ValueError: kinbot could not modify the geometry'}, + {'direction': 'bogus', 'success': True, 'coords': coords_f, 'execution_time': '0:00:00.01'}, + ] + adapter.parse_kinbot_results(rxn=self.rxn_1, results=results, species_to_explore=species_to_explore) + ts_guesses = self.rxn_1.ts_species.ts_guesses + self.assertEqual(len(ts_guesses), 2) # The duplicate and the bogus direction were not appended. + self.assertTrue(ts_guesses[0].success) + self.assertEqual(ts_guesses[0].method, 'kinbot') + self.assertEqual(ts_guesses[0].method_direction, 'F') + self.assertEqual(ts_guesses[0].method_index, 0) + self.assertFalse(ts_guesses[1].success) + self.assertEqual(ts_guesses[1].method_direction, 'R') + self.assertEqual(ts_guesses[1].method_index, 1) + + def test_uma_refinement_flag(self): + """Test that the UMA settings cross the boundary and that refined guesses are marked in the method.""" + adapter = self.get_adapter(dir_name='tst4') + captured = dict() + + def fake_run_in_conda_env(python_executable, script_path, *script_args, check=False): + input_dict = read_yaml_file(path=script_args[1]) + captured['input_dict'] = input_dict + f_well = next(well for well in input_dict['wells'] if well['direction'] == 'F') + r_well = next(well for well in input_dict['wells'] if well['direction'] == 'R') + results = [{'direction': 'F', 'success': True, + 'coords': kinbot_list_to_coords(f_well['structure']), + 'execution_time': '0:00:01.5', 'uma_refined': True}, + {'direction': 'R', 'success': True, + 'coords': kinbot_list_to_coords(r_well['structure']), + 'execution_time': '0:00:01.5', 'uma_refined': False, + 'uma_error': 'The Sella saddle-point search on the UMA potential did not converge.'}, + ] + save_yaml_file(path=input_dict['yml_out_path'], content=results) + return subprocess.CompletedProcess(args=[], returncode=0, stdout='', stderr='') + + uma_on = dict(kinbot_ts.KINBOT_UMA_SETTINGS) + uma_on['refine'] = True + with mock.patch.object(kinbot_ts, 'KINBOT_PYTHON', kinbot_ts.__file__), \ + mock.patch.object(kinbot_ts, 'KINBOT_UMA_SETTINGS', uma_on), \ + mock.patch.object(kinbot_ts, 'run_in_conda_env', side_effect=fake_run_in_conda_env): + adapter.execute_incore() + + self.assertTrue(captured['input_dict']['uma']['refine']) + ts_guesses = self.rxn_1.ts_species.ts_guesses + self.assertEqual(len(ts_guesses), 2) + # A UMA-refined guess is distinguishable, and both variants keep containing 'kinbot'. + self.assertEqual(ts_guesses[0].method, 'kinbot-uma') + self.assertEqual(ts_guesses[1].method, 'kinbot') + self.assertTrue(all('kinbot' in tsg.method for tsg in ts_guesses)) + + @unittest.skipIf(kinbot_ts.KINBOT_PYTHON is None or not os.path.isfile(kinbot_ts.KINBOT_PYTHON), + 'A kinbot_env installation is required for this test') + def test_template_modification_end_to_end(self): """ - A function that is run ONCE after all unit tests in this class. - Delete all project directories created during these unit tests. + Test, against a real kinbot_env, that the constraint templates are actually applied: + the TS guess geometries must differ from the well geometries they started from. + This exercises non-empty get_constraints() 'change' lists for the intra_H_migration + family through the full per-family step sequence (a regression test for calling + get_constraints() with a step beyond the family's max_step, which yields empty + constraints and returns the unmodified well geometry). """ - shutil.rmtree(os.path.join(ARC_TESTING_PATH, 'test_KinBot'), ignore_errors=True) + adapter = self.get_adapter(dir_name='tst_e2e') + adapter.execute_incore() + ts_guesses = [tsg for tsg in self.rxn_1.ts_species.ts_guesses if tsg.success] + self.assertGreaterEqual(len(ts_guesses), 2) + well_xyzs = {'F': self.rxn_1.r_species[0].get_xyz(), 'R': self.rxn_1.p_species[0].get_xyz()} + directions = set() + for tsg in ts_guesses: + directions.add(tsg.method_direction) + well_coords = well_xyzs[tsg.method_direction]['coords'] + max_displacement = max(math.dist(guess_coord, well_coord) + for guess_coord, well_coord + in zip(tsg.initial_xyz['coords'], well_coords)) + self.assertGreater(max_displacement, 0.1) + self.assertEqual(directions, {'F', 'R'}) if __name__ == '__main__': diff --git a/arc/job/adapters/ts/kinbot_ts.py b/arc/job/adapters/ts/kinbot_ts.py index d723003946..9c5dbb4b15 100644 --- a/arc/job/adapters/ts/kinbot_ts.py +++ b/arc/job/adapters/ts/kinbot_ts.py @@ -1,45 +1,39 @@ """ An adapter for executing KinBot jobs +KinBot is run in its own dedicated conda environment (``kinbot_env``, see +``devtools/install_kinbot.sh``) via the standalone worker script +``arc/job/adapters/scripts/kinbot_script.py``, which is invoked through +``run_in_conda_env`` (like the GCN and AutoTST adapters). + https://github.com/zadorlab/KinBot """ import datetime +import os from typing import TYPE_CHECKING -from arc.common import almost_equal_coords, get_logger +from arc.common import ARC_PATH, almost_equal_coords, get_logger, read_yaml_file, save_yaml_file +from arc.imports import settings from arc.job.adapter import JobAdapter from arc.job.adapters.common import _initialize_adapter +from arc.job.env_run import run_in_conda_env from arc.job.factory import register_job_adapter from arc.plotter import save_geo from arc.species.converter import xyz_from_data, xyz_to_kinbot_list from arc.species.species import ARCSpecies, TSGuess, colliding_atoms -HAS_KINBOT = True -try: - from kinbot.modify_geom import modify_coordinates - from kinbot.reaction_finder import ReactionFinder - from kinbot.reaction_generator import ReactionGenerator - from kinbot.parameters import Parameters - from kinbot.qc import QuantumChemistry - from kinbot.stationary_pt import StationaryPoint -except (ImportError, ModuleNotFoundError): - HAS_KINBOT = False - if TYPE_CHECKING: from arc.level import Level - from arc.molecule import Molecule from arc.reaction import ARCReaction from arc.species import ARCSpecies -logger = get_logger() +KINBOT_PYTHON = settings['KINBOT_PYTHON'] +KINBOT_UMA_SETTINGS = settings['kinbot_uma_settings'] +KINBOT_SCRIPT_PATH = os.path.join(ARC_PATH, 'arc', 'job', 'adapters', 'scripts', 'kinbot_script.py') -if not HAS_KINBOT: - # Create a dummy class to properly compile this module if KinBot is missing. - class ReactionGenerator(object): - def __init__(self, species, par, qc, input_file): - self.species, self.par, self.qc, self.input_file = species, par, qc, input_file +logger = get_logger() class KinBotAdapter(JobAdapter): @@ -136,7 +130,7 @@ def __init__(self, self.incore_capacity = 20 self.job_adapter = 'kinbot' self.execution_type = execution_type or 'incore' - self.command = None # KinBot does not have an executable file, just an API. + self.command = 'kinbot_script.py' self.url = 'https://github.com/zadorlab/KinBot' self.family_map = {'1+2_Cycloaddition': ['r12_cycloaddition'], @@ -242,7 +236,8 @@ def set_additional_file_paths(self) -> None: Set additional file paths specific for the adapter. Called from set_file_paths() and extends it. """ - pass + self.yml_in_path = os.path.join(self.local_path, 'input.yml') + self.yml_out_path = os.path.join(self.local_path, 'output.yml') def set_input_file_memory(self) -> None: """ @@ -254,9 +249,11 @@ def execute_incore(self): """ Execute a job incore. """ - if not HAS_KINBOT: - raise ModuleNotFoundError(f'Could not import KinBot, make sure it is properly installed.\n' - f'See {self.url} for more information, or use the Makefile provided with ARC.') + if not KINBOT_PYTHON or not os.path.isfile(KINBOT_PYTHON): + raise FileNotFoundError('The KinBot python executable was not found. ' + 'Make sure the kinbot_env exists and KINBOT_PYTHON is configured ' + '(run devtools/install_kinbot.sh to create the environment). ' + f'See {self.url} for more information.') self._log_job_execution() self.initial_time = self.initial_time if self.initial_time else datetime.datetime.now() @@ -282,67 +279,9 @@ def execute_incore(self): f'Got {len(rxn.r_species)} reactants and {rxn.p_species} products in\n{rxn}.') continue - method_index = 0 - for method_direction, spc in species_to_explore.items(): - symbols = spc.get_xyz()['symbols'] - for m, mol in enumerate(spc.mol_list): - reaction_generator = set_up_kinbot(mol=mol, - families=self.family_map[rxn.family], - kinbot_xyz=xyz_to_kinbot_list(spc.get_xyz()), - multiplicity=rxn.multiplicity, - charge=rxn.charge, - ) - for r, kinbot_rxn in enumerate(reaction_generator.species.reac_obj): - step, fix, change, release = kinbot_rxn.get_constraints(step=20, - geom=kinbot_rxn.species.geom) - ts_guess = TSGuess(method=f'KinBot', - method_direction=method_direction, - method_index=method_index, - index=len(rxn.ts_species.ts_guesses), - ) - ts_guess.tic() - - change_starting_zero = list() - for c in change: - c_new = [ci - 1 for ci in c[:-1]] - c_new.append(c[-1]) - change_starting_zero.append(c_new) - - success, coords = modify_coordinates(species=kinbot_rxn.species, - name=kinbot_rxn.instance_name, - geom=kinbot_rxn.species.geom, - changes=change_starting_zero, - bond=kinbot_rxn.species.bond, - ) - - ts_guess.tok() - unique = True - - if success: - ts_guess.success = True - xyz = xyz_from_data(coords=coords, symbols=symbols) - if xyz is None or colliding_atoms(xyz): - success = False - else: - for other_tsg in rxn.ts_species.ts_guesses: - if other_tsg.success and almost_equal_coords(xyz, other_tsg.initial_xyz): - if 'kinbot' not in other_tsg.method.lower(): - other_tsg.method += ' and KinBot' - unique = False - break - if unique: - ts_guess.process_xyz(xyz) - save_geo(xyz=xyz, - path=self.local_path, - filename=f'KinBot {method_direction} {method_index}', - format_='xyz', - comment=f'KinBot {method_direction} {method_index}' - ) - if not success: - ts_guess.success = False - if unique: - rxn.ts_species.ts_guesses.append(ts_guess) - method_index += 1 + self.write_kinbot_input_file(rxn=rxn, species_to_explore=species_to_explore) + results = self.run_kinbot_subprocess(rxn=rxn) + self.parse_kinbot_results(rxn=rxn, results=results, species_to_explore=species_to_explore) if len(self.reactions) < 5: successes = len([tsg for tsg in rxn.ts_species.ts_guesses if tsg.success and 'kinbot' in tsg.method]) @@ -353,6 +292,123 @@ def execute_incore(self): self.final_time = datetime.datetime.now() + def write_kinbot_input_file(self, + rxn: 'ARCReaction', + species_to_explore: dict, + ) -> None: + """ + Write the YAML input file for the KinBot worker script. + + Args: + rxn (ARCReaction): The reaction to explore. + species_to_explore (dict): Keys are 'F'/'R' directions, values are the respective ARCSpecies. + """ + wells = list() + for method_direction, spc in species_to_explore.items(): + kinbot_xyz = xyz_to_kinbot_list(spc.get_xyz()) + for mol in spc.mol_list: + wells.append({'direction': method_direction, + 'smiles': mol.to_smiles(), + 'structure': kinbot_xyz, + }) + input_dict = {'charge': rxn.charge, + 'multiplicity': rxn.multiplicity, + 'families': self.family_map[rxn.family], + 'wells': wells, + 'uma': dict(KINBOT_UMA_SETTINGS), + 'yml_out_path': self.yml_out_path, + } + save_yaml_file(path=self.yml_in_path, content=input_dict) + + def run_kinbot_subprocess(self, rxn: 'ARCReaction') -> list: + """ + Run the KinBot worker script in the kinbot_env environment and read its results. + + Args: + rxn (ARCReaction): The reaction being explored (only used for logging). + + Returns: + list: The TS guess result dictionaries read from the worker's output file. + """ + if os.path.isfile(self.yml_out_path): + # Remove a stale output file, so a failed subprocess isn't mistaken for fresh results. + os.remove(self.yml_out_path) + # Routed via run_in_conda_env so arc_env's activation vars don't + # leak into the child (see arc/job/env_run.py). + output = run_in_conda_env(KINBOT_PYTHON, KINBOT_SCRIPT_PATH, + '--yml_in_path', self.yml_in_path, + ) + if output.returncode: + logger.warning(f'The KinBot subprocess did not give a successful return code for {rxn}.\n' + f'Got return code: {output.returncode}\n' + f'stdout: {output.stdout}\n' + f'stderr: {output.stderr}') + results = read_yaml_file(path=self.yml_out_path) if os.path.isfile(self.yml_out_path) else list() + return results or list() + + def parse_kinbot_results(self, + rxn: 'ARCReaction', + results: list, + species_to_explore: dict, + ) -> None: + """ + Parse the KinBot worker results into TSGuess objects on the reaction's TS species. + + Args: + rxn (ARCReaction): The reaction to which the results belong. + results (list): The TS guess result dictionaries from the worker script. + species_to_explore (dict): Keys are 'F'/'R' directions, values are the respective ARCSpecies. + """ + method_index = 0 + for result in results: + method_direction = result.get('direction') + if method_direction not in species_to_explore: + logger.warning(f'Got an unexpected direction {method_direction} from the KinBot ' + f'worker script for {rxn}, skipping this entry.') + continue + if result.get('error'): + logger.warning(f'The KinBot worker script reported an error for {rxn} ' + f'in the {method_direction} direction:\n{result["error"]}') + symbols = species_to_explore[method_direction].get_xyz()['symbols'] + # Record UMA refinement in the method name so downstream/benchmark analysis can + # distinguish refined from raw template guesses. Both variants contain 'kinbot', + # which keeps the existing dedup and success-count logic intact. + method = 'KinBot-UMA' if result.get('uma_refined') else 'KinBot' + ts_guess = TSGuess(method=method, + method_direction=method_direction, + method_index=method_index, + index=len(rxn.ts_species.ts_guesses), + execution_time=result.get('execution_time'), + ) + success = bool(result.get('success')) and result.get('coords') is not None + unique = True + + if success: + ts_guess.success = True + xyz = xyz_from_data(coords=result['coords'], symbols=symbols) + if xyz is None or colliding_atoms(xyz): + success = False + else: + for other_tsg in rxn.ts_species.ts_guesses: + if other_tsg.success and almost_equal_coords(xyz, other_tsg.initial_xyz): + if 'kinbot' not in other_tsg.method.lower(): + other_tsg.method += ' and KinBot' + unique = False + break + if unique: + ts_guess.process_xyz(xyz) + save_geo(xyz=xyz, + path=self.local_path, + filename=f'KinBot {method_direction} {method_index}', + format_='xyz', + comment=f'KinBot {method_direction} {method_index}', + ) + if not success: + ts_guess.success = False + if unique: + rxn.ts_species.ts_guesses.append(ts_guess) + method_index += 1 + def execute_queue(self): """ (Execute a job to the server's queue.) @@ -361,67 +417,4 @@ def execute_queue(self): self.execute_incore() -def set_up_kinbot(mol: 'Molecule', - families: list[str], - kinbot_xyz: list[str | float], - multiplicity: int, - charge: int, - ) -> ReactionGenerator: - """ - This will set up KinBot to run for a unimolecular reaction starting from the single reactant side. - - Args: - mol (Molecule): The RMG Molecule instance representing the unimolecular well to react. - families (list[str]): The specific KinBot families to try. - kinbot_xyz (list): The cartesian coordinates of the well in the KinBot list format. - multiplicity (int): The well/reaction multiplicity. - charge (int): The well/reaction charge. - - Returns: - ReactionGenerator: The KinBot ReactionGenerator instance. - """ - params = Parameters() - params.par['title'] = 'ARC' - # molecule information - params.par['smiles'] = mol.to_smiles() - params.par['structure'] = kinbot_xyz - params.par['charge'] = charge - params.par['mult'] = multiplicity - params.par['dimer'] = 0 - # steps - params.par['reaction_search'] = 1 - params.par['families'] = families - params.par['homolytic_scissions'] = 0 - params.par['pes'] = 0 - params.par['high_level'] = 0 - params.par['conformer_search'] = 0 - params.par['me'] = 0 - # params.par['one_reaction_fam'] = 1 - params.par['ringrange'] = [3, 9] - - well = StationaryPoint(name='well0', - charge=charge, - mult=multiplicity, - structure=kinbot_xyz, - ) - - well.calc_chemid() - well.bond_mx() - well.find_cycle() - well.find_atom_eqv() - well.find_conf_dihedral() - - qc = QuantumChemistry(params.par) - rxn_finder = ReactionFinder(well, params.par, qc) - rxn_finder.find_reactions() - - reaction_generator = ReactionGenerator(species=well, - par=params.par, - qc=qc, - input_file=None, - ) - - return reaction_generator - - register_job_adapter('kinbot', KinBotAdapter) diff --git a/arc/job/adapters/ts/orca_neb.py b/arc/job/adapters/ts/orca_neb.py index 0647fd3169..46705bb417 100644 --- a/arc/job/adapters/ts/orca_neb.py +++ b/arc/job/adapters/ts/orca_neb.py @@ -17,7 +17,7 @@ from arc.job.adapters.orca import OrcaAdapter, _format_orca_method, _format_orca_basis from arc.job.factory import register_job_adapter from arc.job.local import execute_command -from arc.level import Level +from arc.level import Level, plain_level_dict from arc.parser.parser import parse_geometry from arc.species import TSGuess from arc.species.converter import xyz_to_xyz_file_format @@ -39,15 +39,15 @@ %%maxcore ${memory} %%pal nprocs ${cpus} end -%%neb +%%neb Interpolation ${interpolation} NImages ${nnodes} PrintLevel 3 PreOpt ${preopt} - NEB_END_XYZFILE "${abs_path}/product.xyz" + NEB_END_XYZFILE "product.xyz" END -* XYZFILE ${charge} ${multiplicity} ${abs_path}/reactant.xyz +* XYZFILE ${charge} ${multiplicity} reactant.xyz """ @@ -222,7 +222,6 @@ def write_input_file(self) -> None: input_dict['cpus'] = self.cpu_cores input_dict['charge'] = self.charge input_dict['multiplicity'] = self.multiplicity - input_dict['abs_path'] = self.local_path # NEB specific parameters neb_settings = orca_neb_settings.get('keyword', {}) @@ -321,6 +320,7 @@ def process_run(self): index=len(self.reactions[0].ts_species.ts_guesses), success=False, t0=self.initial_time, + level=plain_level_dict(self.level), ) if os.path.isfile(self.local_path_to_output_file): tsg.initial_xyz = parse_geometry(self.local_path_to_output_file) diff --git a/arc/job/adapters/ts/seed_hub.py b/arc/job/adapters/ts/seed_hub.py new file mode 100644 index 0000000000..4a38254cdb --- /dev/null +++ b/arc/job/adapters/ts/seed_hub.py @@ -0,0 +1,168 @@ +""" +Shared TS-seed and wrapper-constraint hub. + +This module centralizes: +1. How TS seeds are requested from a base TS-search adapter. +2. How wrapper adapters (e.g., CREST) request family-specific constraints for a seed. +""" + +from typing import Dict, List, Optional + +from arc.common import get_logger +from arc.species.converter import xyz_to_dmat + +logger = get_logger() + + +def get_ts_seeds(reaction: 'ARCReaction', + base_adapter: str = 'heuristics', + dihedral_increment: Optional[int] = None, + ) -> List[dict]: + """ + Return TS seed entries from a base TS-search adapter. + + Seed schema: + - ``xyz`` (dict): Cartesian coordinates. + - ``family`` (str): The family associated with this seed. + - ``method`` (str): Human-readable generator label. + - ``source_adapter`` (str): Adapter id that generated the seed. + - ``metadata`` (dict, optional): Adapter-specific auxiliary fields. + + Args: + reaction: The ARC reaction object. + base_adapter: The underlying TS-search adapter providing seeds. + dihedral_increment: Optional scan increment used by adapters that support it. + """ + adapter = (base_adapter or '').lower() + if adapter != 'heuristics': + raise ValueError(f'Unsupported TS seed base adapter: {base_adapter}') + + # Lazily import to avoid circular imports with heuristics.py. + from arc.job.adapters.ts.heuristics import FAMILY_SETS, h_abstraction, hydrolysis + + xyz_entries = list() + if reaction.family == 'H_Abstraction': + xyzs = h_abstraction(reaction=reaction, dihedral_increment=dihedral_increment) + for entry in xyzs: + xyz = entry.get('xyz') if isinstance(entry, dict) else entry + method = entry.get('method', 'Heuristics') if isinstance(entry, dict) else 'Heuristics' + if xyz is not None: + xyz_entries.append({ + 'xyz': xyz, + 'method': method, + 'family': reaction.family, + 'source_adapter': 'heuristics', + 'metadata': {}, + }) + elif reaction.family in FAMILY_SETS['hydrolysis_set_1'] or reaction.family in FAMILY_SETS['hydrolysis_set_2']: + try: + xyzs_raw, families, indices = hydrolysis(reaction=reaction) + xyz_entries = [{ + 'xyz': xyz, + 'method': 'Heuristics', + 'family': family, + 'source_adapter': 'heuristics', + 'metadata': {'indices': idx}, + } for xyz, family, idx in zip(xyzs_raw, families, indices)] + except ValueError: + xyz_entries = list() + return xyz_entries + + +def get_wrapper_constraints(wrapper: str, + reaction: 'ARCReaction', + seed: dict, + ) -> Optional[dict]: + """ + Return wrapper-specific constraints for a TS seed. + + Args: + wrapper: Wrapper adapter id (e.g., ``crest``). + reaction: The ARC reaction object. + seed: A seed entry returned by :func:`get_ts_seeds`. + """ + wrapper_name = (wrapper or '').lower() + if wrapper_name != 'crest': + raise ValueError(f'Unsupported wrapper adapter: {wrapper}') + return _get_crest_constraints(reaction=reaction, seed=seed) + + +def _get_crest_constraints(reaction: 'ARCReaction', seed: dict) -> Optional[Dict[str, int]]: + """ + Return CREST constraints for a seed. + + Currently, only H_Abstraction is supported. + """ + family = seed.get('family') or reaction.family + xyz = seed.get('xyz') + if family != 'H_Abstraction' or xyz is None: + return None + return _get_h_abs_atoms_from_xyz(xyz) + + +def _get_h_abs_atoms_from_xyz(xyz: dict) -> Optional[Dict[str, int]]: + """ + Determine H-abstraction atoms from a TS guess. + + Returns: + Optional[Dict[str, int]]: ``{'H': int, 'A': int, 'B': int}``, or ``None``. + """ + symbols = xyz.get('symbols') if isinstance(xyz, dict) else None + if not symbols: + return None + dmat = xyz_to_dmat(xyz) + if dmat is None: + return None + + closest_atoms = dict() + for i in range(len(symbols)): + nearest = sorted( + ((dmat[i][j], j) for j in range(len(symbols)) if j != i), + key=lambda x: x[0], + )[:2] + closest_atoms[i] = [idx for _, idx in nearest] + + hydrogen_indices = [i for i, symbol in enumerate(symbols) if symbol.startswith('H')] + condition_occurrences = list() + + for hydrogen_index in hydrogen_indices: + atom_neighbors = closest_atoms[hydrogen_index] + is_heavy_present = any(not symbols[atom].startswith('H') for atom in atom_neighbors) + if_hydrogen_present = any(symbols[atom].startswith('H') and atom != hydrogen_index for atom in atom_neighbors) + + if is_heavy_present and if_hydrogen_present: + condition_occurrences.append({'H': hydrogen_index, 'A': atom_neighbors[0], 'B': atom_neighbors[1]}) + + if condition_occurrences: + if len(condition_occurrences) > 1: + occurrence_distances = list() + for occurrence in condition_occurrences: + h_atom = occurrence['H'] + a_atom = occurrence['A'] + b_atom = occurrence['B'] + occurrence_distances.append((occurrence, dmat[h_atom][a_atom] + dmat[h_atom][b_atom])) + best_occurrence = min(occurrence_distances, key=lambda x: x[1])[0] + return {'H': best_occurrence['H'], 'A': best_occurrence['A'], 'B': best_occurrence['B']} + single_occurrence = condition_occurrences[0] + return {'H': single_occurrence['H'], 'A': single_occurrence['A'], 'B': single_occurrence['B']} + + min_distance = float('inf') + selected_hydrogen = None + selected_heavy_atoms = None + for hydrogen_index in hydrogen_indices: + atom_neighbors = closest_atoms[hydrogen_index] + heavy_atoms = [atom for atom in atom_neighbors if not symbols[atom].startswith('H')] + if len(heavy_atoms) < 2: + continue + distances = dmat[hydrogen_index][heavy_atoms[0]] + dmat[hydrogen_index][heavy_atoms[1]] + if distances < min_distance: + min_distance = distances + selected_hydrogen = hydrogen_index + selected_heavy_atoms = heavy_atoms + + if selected_hydrogen is not None and selected_heavy_atoms is not None: + return {'H': selected_hydrogen, 'A': selected_heavy_atoms[0], 'B': selected_heavy_atoms[1]} + + logger.warning('No valid hydrogen atom found for CREST H-abstraction atoms.') + return None + diff --git a/arc/job/adapters/ts/xtb_gsm.py b/arc/job/adapters/ts/xtb_gsm.py index bcf7d73bda..6f32796872 100644 --- a/arc/job/adapters/ts/xtb_gsm.py +++ b/arc/job/adapters/ts/xtb_gsm.py @@ -25,7 +25,7 @@ from arc.job.adapters.common import _initialize_adapter from arc.job.factory import register_job_adapter from arc.job.local import change_mode, execute_command -from arc.level import Level +from arc.level import Level, plain_level_dict from arc.parser.parser import parse_trajectory from arc.species import TSGuess from arc.species.converter import xyz_to_xyz_file_format @@ -306,6 +306,13 @@ def set_additional_file_paths(self) -> None: self.tm2orca_path = os.path.join(self.local_path, 'tm2orca.py') self.scratch_initial0000_path = os.path.join(self.local_path, 'scratch', 'initial0000.xyz') self.stringfile_path = os.path.join(self.local_path, 'stringfile.xyz0000') + # Side-effect directory written by the patched ``ograd`` wrapper. + # Holds per-node ``