# Source code for module lit_mlflow.callback
import os
from pathlib import Path
import tempfile
from typing import Any, cast

from lightning.fabric.utilities.rank_zero import (
    rank_zero_info,  # type: ignore  # noqa: PGH003
    rank_zero_only,
    rank_zero_warn,  # type: ignore  # noqa: PGH003
)
import lightning.pytorch as pl
from lightning.pytorch.callbacks import Callback, DeviceStatsMonitor, EarlyStopping
from lightning.pytorch.core.optimizer import LightningOptimizer
from lightning.pytorch.loggers import Logger, MLFlowLogger
from lightning.pytorch.loggers.mlflow import _get_resolve_tags
from lightning.pytorch.trainer.states import TrainerFn
from lightning.pytorch.utilities.model_summary.model_summary import ModelSummary
from lightning.pytorch.utilities.types import STEP_OUTPUT
import mlflow
from mlflow import MlflowClient, MlflowException
from mlflow.entities.dataset import Dataset
from mlflow.entities.dataset_input import DatasetInput
from mlflow.entities.input_tag import InputTag
from mlflow.entities.run import Run
from mlflow.entities.run_status import RunStatus
from mlflow.models import Model
from mlflow.pytorch import pickle_module as mlflow_pytorch_pickle_module
from torch.optim import Optimizer  # pyright: ignore[reportPrivateImportUsage]

from lit_mlflow.logger import DbxMLFlowLogger
from lit_mlflow.utils.dbx import get_databricks_tags


class MlFlowAutoCallback(Callback):
    """Lightning callback that logs run information through the trainer's MLflow logger.

    On ``setup`` the callback disables ``mlflow.autolog`` and picks the first
    supported logger (``MLFlowLogger`` or ``DbxMLFlowLogger``) from the trainer.
    During the trainer life cycle it then writes tags, parameters, metrics, a
    model summary and the serialized model to that logger's run, and keeps the
    run status (RUNNING / FINISHED / FAILED) up to date.

    Args:
        verbose: Stored on the instance as ``self.verbose``.
        patch_device_monitor: When true, ``setup`` replaces the logging method of
            every ``DeviceStatsMonitor`` callback so its metric keys are prefixed
            with ``system/``.
    """

    def __init__(self, verbose: bool = True, patch_device_monitor: bool = True) -> None:
        super().__init__()
        self.supported_loggers = (MLFlowLogger, DbxMLFlowLogger)
        self.verbose = verbose
        # Resolved in ``setup``; stays ``None`` when the trainer has no supported logger.
        self.logger: MLFlowLogger | DbxMLFlowLogger | None = None
        self.autologging_disabled = False
        self.patch_device_monitor = patch_device_monitor

    @property
    def client(self) -> MlflowClient | None:
        """Return the logger's ``experiment`` client, or ``None`` when no logger is set."""
        if self.logger:
            return self.logger.experiment
        return None

    def _get_logger(self, loggers: list[Logger]) -> MLFlowLogger | DbxMLFlowLogger | None:
        """Return the first logger in ``loggers`` that is a supported MLflow logger.

        A warning is emitted for an empty list and for every unsupported logger
        that precedes the first supported one. ``None`` is returned when
        ``loggers`` is not a list or contains no supported logger.
        """
        if not isinstance(loggers, list):
            return None
        if not loggers:
            rank_zero_warn("Cannot log artifacts because Trainer has no logger.")
            return None

        rank_zero_info(f" Supported loggers are: {', '.join(str(x.__name__) for x in self.supported_loggers)}")
        for logger in loggers:
            if isinstance(logger, self.supported_loggers):
                return logger
            rank_zero_warn(f"{self.__class__.__name__} does not support logging with {logger.__class__.__name__}.")
        return None

    def _prevent_entry(self, trainer: "pl.Trainer") -> bool:
        """Return ``True`` when a hook should do nothing (no logger, or not global rank zero)."""
        return self.logger is None or not trainer.is_global_zero

    def _get_optimizer(self, optimizer: LightningOptimizer | Optimizer) -> Optimizer:
        """Unwrap a ``LightningOptimizer``; any other optimizer is returned unchanged."""
        return optimizer._optimizer if isinstance(optimizer, LightningOptimizer) else optimizer

    def _get_optimizer_name(self, optimizer: LightningOptimizer | Optimizer) -> str:
        """Return the class name of the (unwrapped) optimizer."""
        opt = self._get_optimizer(optimizer)
        return opt.__class__.__name__

    def _get_optimizer_defaults(self, optimizer: LightningOptimizer | Optimizer) -> dict[str, Any]:
        """Return the optimizer's ``defaults`` mapping, or an empty dict when it has none."""
        opt = self._get_optimizer(optimizer)
        return opt.defaults if hasattr(opt, "defaults") else {}

    def _log_early_stop_params(self, early_stop_callback: EarlyStopping) -> None:
        """Logs early stopping configuration parameters to MLflow."""
        if self.logger is None:
            return None
        params = {
            p: getattr(early_stop_callback, p)
            for p in ["monitor", "mode", "patience", "min_delta", "stopped_epoch"]
            if hasattr(early_stop_callback, p)
        }
        self.logger.log_hyperparams(params)

    def _log_early_stop_metrics(self, early_stop_callback: EarlyStopping) -> None:
        """Logs early stopping behavior results (e.g. stopped epoch) as metrics to MLflow.

        Nothing is logged when there is no logger or when ``stopped_epoch`` is 0.
        """
        if self.logger is None:
            return None
        if early_stop_callback is None or early_stop_callback.stopped_epoch == 0:
            return None

        metrics: dict[str, float] = {
            "stopped_epoch": early_stop_callback.stopped_epoch,
            "restored_epoch": early_stop_callback.stopped_epoch - max(1, early_stop_callback.patience),
        }
        if hasattr(early_stop_callback, "best_score"):
            metrics["best_score"] = float(early_stop_callback.best_score)
        if hasattr(early_stop_callback, "wait_count"):
            metrics["wait_count"] = early_stop_callback.wait_count
        self.logger.log_metrics(metrics)

    def _resolve_early_stopping_callback(self, trainer: "pl.Trainer") -> EarlyStopping | None:
        """Return the first ``EarlyStopping`` callback registered on the trainer, if any."""
        if hasattr(trainer, "callbacks"):
            for callback in cast(list[Callback], trainer.callbacks):  # pyright: ignore[reportAttributeAccessIssue]
                if isinstance(callback, EarlyStopping):
                    return callback
        return None

    def _log_model_summary(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Upload the full-depth model summary as the run artifact ``model_summary.txt``."""
        # NOTE(review): this guard reads the private ``_run_id`` while sibling helpers
        # use ``run_id`` - confirm whether the difference is intentional.
        if not (self.logger and self.logger._run_id and self.client):
            return None

        summary = str(ModelSummary(pl_module, max_depth=-1))
        artifact_name = "model_summary.txt"
        with tempfile.TemporaryDirectory(prefix="test", suffix="test", dir=Path.cwd()) as tmp_dir:
            summary_file = Path(tmp_dir) / artifact_name
            summary_file.write_text(summary, encoding="utf-8")
            # Upload the single file to the artifact root. ``log_artifacts`` with
            # ``artifact_path="model_summary.txt"`` would create a *directory* of
            # that name and nest the file inside it.
            self.client.log_artifact(str(self.logger.run_id), str(summary_file))
        return None

    def _log_model(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Serialize ``pl_module`` with ``mlflow.pytorch.save_model`` and upload it as ``model``.

        After the upload the model is registered on the run; a failure of that
        registration step is reported as a warning instead of being raised.
        """
        if self.logger and self.logger.run_id and self.client:
            rank_zero_info("Saving the model and uploading to MLFlow!")
            with tempfile.TemporaryDirectory(prefix="test", suffix="test", dir=Path.cwd()) as tmp_dir:
                local_path = Path(tmp_dir) / "model"
                artifact_path = "model"
                mlflow_model = Model(artifact_path=artifact_path, run_id=self.logger.run_id)
                mlflow.pytorch.save_model(
                    pytorch_model=pl_module,
                    path=local_path,
                    conda_env=None,
                    mlflow_model=mlflow_model,
                    code_paths=None,
                    pickle_module=mlflow_pytorch_pickle_module,
                    signature=None,
                    input_example=None,
                    requirements_file=None,
                    extra_files=None,
                    pip_requirements=None,
                    extra_pip_requirements=None,
                )
                # Upload the saved model directory itself. Uploading ``tmp_dir``
                # would place the files under ``model/model/`` while the model
                # registered below points at ``model``.
                self.client.log_artifacts(
                    run_id=self.logger.run_id,
                    local_dir=str(local_path),
                    artifact_path=artifact_path,
                )
                try:
                    self.client._record_logged_model(run_id=self.logger.run_id, mlflow_model=mlflow_model)
                except MlflowException:
                    rank_zero_warn(
                        f"Logging model metadata to the tracking server {self.logger._tracking_uri} has failed"
                    )
        return None

    def _print_auto_logged_info(self) -> None:
        """Print run id, ``model`` artifacts, params, metrics and non-``mlflow.`` tags of the run."""
        if self.logger and self.logger.run_id and self.client:
            # Use the logger's client for both calls so the run and its artifacts
            # are read from the same tracking server.
            run = self.client.get_run(self.logger.run_id)
            if run:
                artifacts = [f.path for f in self.client.list_artifacts(run.info.run_id, "model")]
                tags = {k: v for k, v in run.data.tags.items() if not k.startswith("mlflow.")}
                rank_zero_info(f"run_id: {run.info.run_id}")
                rank_zero_info(f"artifacts: {artifacts}")
                rank_zero_info(f"params: {run.data.params}")
                rank_zero_info(f"metrics: {run.data.metrics}")
                rank_zero_info(f"tags: {tags}")

    def _log_cluster_tags(self) -> None:
        """Set every tag returned by ``get_databricks_tags`` on the current run."""
        if not (self.logger and self.logger.run_id and self.client):
            # Nothing to write to, so do not query the tags at all.
            return None
        for tag, value in get_databricks_tags().items():
            self.client.set_tag(self.logger.run_id, key=tag, value=value)
        return None

    def _log_dataset_info(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Record the train dataset's class name as a dataset input of the current run.

        Skipped when there is no run to log to, when the trainer has no
        datamodule, or when the train dataloader exposes no ``dataset`` attribute.
        """
        if not (self.logger and self.logger.run_id and self.client):
            return None
        datamodule = getattr(trainer, "datamodule", None)
        if not datamodule:
            return None

        train_loader = datamodule.train_dataloader()  # pyright: ignore[reportAttributeAccessIssue]
        dataset = getattr(train_loader, "dataset", None)
        if dataset is None:
            # The returned object does not wrap a single dataset; nothing to describe.
            return None

        meta_ds = Dataset(
            name=dataset.__class__.__name__,
            digest="",
            source=train_loader.__class__.__name__,
            source_type="",
        )
        ds_input = DatasetInput(
            dataset=meta_ds,
            tags=[
                InputTag("class", dataset.__class__.__name__),
            ],
        )
        self.client.log_inputs(run_id=self.logger.run_id, datasets=[ds_input])
        return None

    def _patch_device_stats_monitor(self, trainer: "pl.Trainer") -> None:
        """Replace ``_get_and_log_device_stats`` on every ``DeviceStatsMonitor`` callback.

        The replacement logs the device statistics to all trainer loggers with
        keys of the form ``system/<key><separator><stat name>``.
        """

        def _patched_prefix_metric_keys(
            metrics_dict: dict[str, float], prefix: str, separator: str
        ) -> dict[str, float]:
            return {prefix + separator + k: v for k, v in metrics_dict.items()}

        def _patched_get_and_log_device_stats(monitor: Any, trainer: "pl.Trainer", key: str) -> None:
            # ``monitor`` is the DeviceStatsMonitor instance the function is bound to below.
            if not trainer._logger_connector.should_update_logs:
                return

            device = trainer.strategy.root_device
            if monitor._cpu_stats is False and device.type == "cpu":
                # cpu stats are disabled
                return

            device_stats = trainer.accelerator.get_device_stats(device)

            if monitor._cpu_stats and device.type != "cpu":
                # Don't query CPU stats twice if CPU is accelerator
                from lightning.pytorch.accelerators.cpu import get_cpu_stats

                device_stats.update(get_cpu_stats())

            for logger in trainer.loggers:
                separator = logger.group_separator
                prefixed_device_stats = _patched_prefix_metric_keys(device_stats, f"system/{key}", separator)
                logger.log_metrics(prefixed_device_stats, step=trainer.fit_loop.epoch_loop._batches_that_stepped)

        patched = False
        if hasattr(trainer, "callbacks"):
            for callback in cast(list[Callback], trainer.callbacks):  # pyright: ignore[reportAttributeAccessIssue]
                if isinstance(callback, DeviceStatsMonitor):
                    callback._get_and_log_device_stats = _patched_get_and_log_device_stats.__get__(
                        callback, DeviceStatsMonitor
                    )
                    patched = True
                    rank_zero_info("Lightning device stats monitoring enabled!")
        if not patched:
            rank_zero_info("Lightning device stats monitor has not been added to callbacks!")

    @rank_zero_only
    def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None:
        """Called when fit, validate, test, predict, or tune begins.

        Disables ``mlflow.autolog`` once, resolves the logger, logs cluster tags
        and dataset information, optionally patches the device stats monitor and
        marks the run as RUNNING.
        """
        if not self.autologging_disabled:
            rank_zero_info("Starting MLFlow Databricks logging!")
            rank_zero_info("Default auto logging disabled!")
            mlflow.autolog(disable=True)
            self.autologging_disabled = True

        if trainer.is_global_zero:
            self.logger = self._get_logger(trainer.loggers)
            self._log_cluster_tags()
            self._log_dataset_info(trainer, pl_module)
            if self.patch_device_monitor:
                self._patch_device_stats_monitor(trainer)

        if not self._prevent_entry(trainer) and self.logger and self.logger.run_id and self.client:
            self.client.update_run(run_id=self.logger.run_id, status=RunStatus.to_string(RunStatus.RUNNING))

    @rank_zero_only
    def teardown(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None:
        """Called when fit, validate, test, predict, or tune ends.

        Marks the run as FINISHED; after the fitting stage the run is also terminated.
        """
        if self._prevent_entry(trainer):
            return None
        if self.logger and self.logger.run_id and self.client:
            self.client.update_run(run_id=self.logger.run_id, status=RunStatus.to_string(RunStatus.FINISHED))
            if stage == TrainerFn.FITTING:
                self.client.set_terminated(run_id=self.logger.run_id, status=RunStatus.to_string(RunStatus.FINISHED))
        return None

    def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when fit begins."""

    def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when fit ends."""

    def on_sanity_check_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the validation sanity check starts."""

    def on_sanity_check_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the validation sanity check ends."""

    def on_train_batch_start(
        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int
    ) -> None:
        """Called when the train batch begins."""

    def on_train_batch_end(
        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", outputs: STEP_OUTPUT, batch: Any, batch_idx: int
    ) -> None:
        """Called when the train batch ends.

        Note:
            The value ``outputs["loss"]`` here will be the normalized value w.r.t ``accumulate_grad_batches`` of the
            loss returned from ``training_step``.

        """

    def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the train epoch begins."""

    def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the train epoch ends.

        Logs every entry of ``trainer.callback_metrics`` (converted to ``float``)
        to the MLflow logger, using ``pl_module.current_epoch`` as the step.
        """
        if self.logger:
            metrics = {str(key): float(value) for key, value in trainer.callback_metrics.items()}
            self.logger.log_metrics(metrics, pl_module.current_epoch)

    def on_validation_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the val epoch begins."""

    def on_validation_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the val epoch ends."""

    def on_test_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the test epoch begins."""

    def on_test_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the test epoch ends."""

    def on_predict_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the predict epoch begins."""

    def on_predict_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the predict epoch ends."""

    def on_validation_batch_start(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        """Called when the validation batch begins."""

    def on_validation_batch_end(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        outputs: STEP_OUTPUT,
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        """Called when the validation batch ends."""

    def on_test_batch_start(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        """Called when the test batch begins."""

    def on_test_batch_end(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        outputs: STEP_OUTPUT,
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        """Called when the test batch ends."""

    def on_predict_batch_start(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        """Called when the predict batch begins."""

    def on_predict_batch_end(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        outputs: Any,
        batch: Any,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        """Called when the predict batch ends."""

    @rank_zero_only
    def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the train begins.

        Tags the run with ``Mode=training`` and logs ``epochs``, each optimizer's
        class name and defaults, and the early stopping configuration (if an
        ``EarlyStopping`` callback is registered).
        """
        if self._prevent_entry(trainer):
            return None

        if self.logger and self.logger.run_id and self.client:
            run_id = str(self.logger.run_id)
            self.client.set_tag(run_id=run_id, key="Mode", value="training")
            self.client.log_param(run_id=run_id, key="epochs", value=trainer.max_epochs)
            if hasattr(trainer, "optimizers"):
                for i, optimizer in enumerate(trainer.optimizers):
                    self.client.log_param(
                        run_id, key=f"optimizer{i}_name", value=self._get_optimizer_name(optimizer)
                    )
                    defaults = self._get_optimizer_defaults(optimizer)
                    for key, value in defaults.items():
                        self.client.log_param(run_id, key=f"optimizer{i}_{key}", value=str(value))

        callback = self._resolve_early_stopping_callback(trainer)
        if callback:
            self._log_early_stop_params(callback)
        return None

    @rank_zero_only
    def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the train ends.

        Logs early stopping metrics (if applicable), the model summary and the model itself.
        """
        if self._prevent_entry(trainer):
            return None
        callback = self._resolve_early_stopping_callback(trainer)
        if callback:
            self._log_early_stop_metrics(callback)
        self._log_model_summary(trainer, pl_module)
        self._log_model(trainer, pl_module)
        return None

    @rank_zero_only
    def on_validation_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the validation loop begins; tags the run with ``Mode=validating``."""
        if self.logger and self.logger.run_id and self.client:
            run_id = str(self.logger.run_id)
            self.client.set_tag(run_id=run_id, key="Mode", value="validating")

    @rank_zero_only
    def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the validation loop ends."""

    @rank_zero_only
    def on_test_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the test begins; tags the run with ``Mode=testing``."""
        if self._prevent_entry(trainer):
            return None
        if self.logger and self.logger.run_id and self.client:
            self.client.set_tag(self.logger.run_id, key="Mode", value="testing")
        return None

    def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the test ends."""
        # originally, mlflow.autolog changes the mode to testing here, but we do it in on_test_start

    def on_predict_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the predict begins."""

    def on_predict_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when predict ends."""

    @rank_zero_only
    def on_exception(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", exception: BaseException) -> None:
        """Called when any trainer execution is interrupted by an exception; marks the run FAILED."""
        if self._prevent_entry(trainer):
            return
        if self.logger and self.logger.run_id and self.client:
            self.client.set_terminated(run_id=self.logger.run_id, status=RunStatus.to_string(RunStatus.FAILED))