diff --git a/.gitignore b/.gitignore
index 8553d72..edad3f4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,23 +1,46 @@
-*h5
+# Python bytecode and caches
+__pycache__/
+*.py[cod]
+*$py.class
+.pytest_cache/
+.ruff_cache/
+.mypy_cache/
+.coverage
+htmlcov/
+
+# Local environments and build outputs
+.venv/
+venv/
+build/
+dist/
+*.egg-info/
+
+# Notebook and editor metadata
+.ipynb_checkpoints/
+.vscode/
+
+# Dataset and spreadsheet artifacts
+*.h5
+*.hdf5
+*.xlsx
+*.xls
+*.xlxs
+
+# Model checkpoints and generated training artifacts
+*-temp-weights-*
+*.pt
+*.pth
+model.json
+
+# Legacy experiment output folders/files
 Mean
+mean_folder
 onlineGRU
 seq2point
 seq2seq
 rnn
 dae
-disaggregate/__pycache__
-*hdf5
 excess
-.ipynb_checkpoints
-.pycache
-mean_folder
 pre-trained-mean
 prev_disaggregate
-.xlsx
-.xlxs
-__pycache__
-__pycache__/*
-disaggregate/__pycache__/*
-disaggregate/__pycache__/
 buildsys_notebooks
-.vscode
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index d5414c2..f84ef3a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -22,12 +22,13 @@ RUN pip install --no-cache-dir uv
 # Copy project files (assumes everything is in one dir)
 COPY . .
 
-# Sync dependencies using uv (installs from pyproject.toml)
-RUN uv pip install --system .
+# Install the package with all optional backends. Use a narrower extra such as
+# .[torch], .[tensorflow], or .[classical] for backend-specific production images.
+RUN uv pip install --system ".[all]"
 
 
 # Optional: install dev dependencies too
-# RUN uv pip install .[dev]
+# RUN uv pip install --system ".[dev]"
 
 # Set env vars
 ENV PYTHONUNBUFFERED=1
diff --git a/README.md b/README.md
index a06e8ba..f90bafd 100644
--- a/README.md
+++ b/README.md
@@ -1,90 +1,218 @@
 # NILMTK-Contrib
 
-(Note - This package only works on Python versions <= 3.11)
-
-This repository contains all the state-of-the-art algorithms for the task of energy disaggregation implemented using NILMTK's Rapid Experimentation API. You can find the paper [here](https://doi.org/10.1145/3360322.3360844). All the notebooks that were used to can be found [here](https://github.com/nilmtk/buildsys2019-paper-notebooks).
-
-Using the NILMTK-contrib you can use the following algorithms:
- - Additive Factorial Hidden Markov Model
- - Additive Factorial Hidden Markov Model with Signal Aggregate Constraints
- - Discriminative Sparse Coding
- - RNN
- - Denoising Auto Encoder
- - Seq2Point
- - Seq2Seq
- - WindowGRU
-
-The above state-of-the-art algorithms have been added to this repository. 
-
-You can do the following using the new NILMTK's Rapid Experimentation API:
- - Training and Testing across multiple appliances
- - Training and Testing across multiple datasets (Transfer learning)
- - Training and Testing across multiple buildings
- - Training and Testing with Artificial aggregate
- - Training and Testing with different sampling frequencies
- 
-Refer to this [notebook](https://github.com/nilmtk/nilmtk-contrib/blob/master/sample_notebooks/NILMTK%20API%20Tutorial.ipynb) to know more about the usage of the API.
+NILMTK-Contrib provides NILMTK-compatible implementations of non-intrusive load monitoring (NILM) and energy disaggregation algorithms. The package is designed for use with NILMTK's rapid experimentation API and includes classical, TensorFlow, and PyTorch model backends.
 
-## Citation
+The repository paper is:
 
+Batra et al., "Towards Reproducible State-of-the-Art Energy Disaggregation", BuildSys 2019, DOI: https://doi.org/10.1145/3360322.3360844.
 
-If you find this repo useful for your research, please consider citing our paper:
+## Runtime Requirements
 
-```bibtex
-@inproceedings{10.1145/3360322.3360844,
-author = {Batra, Nipun and Kukunuri, Rithwik and Pandey, Ayush and Malakar, Raktim and Kumar, Rajat and Krystalakos, Odysseas and Zhong, Mingjun and Meira, Paulo and Parson, Oliver},
-title = {Towards Reproducible State-of-the-Art Energy Disaggregation},
-year = {2019},
-isbn = {9781450370059},
-publisher = {Association for Computing Machinery},
-address = {New York, NY, USA},
-url = {https://doi.org/10.1145/3360322.3360844},
-doi = {10.1145/3360322.3360844},
-booktitle = {Proceedings of the 6th ACM International Conference on Systems for Energy-Efficient Buildings, Cities, and Transportation},
-pages = {193–202},
-numpages = {10},
-keywords = {smart meters, energy disaggregation, non-intrusive load monitoring},
-location = {New York, NY, USA},
-series = {BuildSys '19}
-}
-}
+- Python `>=3.11,<3.12`.
+- Install a backend extra before importing or training backend-specific models.
+- NILMTK-compatible datasets are required for real experiments, notebook runs, and benchmark reproduction.
+- Model training and benchmark comparisons should be run in controlled server environments with the relevant backend, dataset, and hardware available.
+
+Python 3.12 and newer are not supported by the current package metadata because TensorFlow and NILMTK compatibility must be verified first.
 
+## Installation
+
+Minimal install for package metadata and lightweight imports:
+
+```bash
+uv pip install git+https://github.com/nilmtk/nilmtk-contrib.git
 ```
-For any enquiries, please contact the main authors.
 
-## Installation Details
+TensorFlow backend:
+
+```bash
+uv pip install "nilmtk-contrib[tensorflow] @ git+https://github.com/nilmtk/nilmtk-contrib.git"
+```
 
-## UV Support
-This Python package uses uv for installation. uv is a fast and modern Python package manager that replaces tools like pip and virtualenv, with support for pyproject.toml and ultra-fast dependency resolution. 
+PyTorch backend:
 
-To install nilmtk_contrib, first install [uv](https://docs.astral.sh/uv/getting-started/installation/) and then run:<br>
+```bash
+uv pip install "nilmtk-contrib[torch] @ git+https://github.com/nilmtk/nilmtk-contrib.git"
 ```
-uv pip install git+https://github.com/nilmtk/nilmtk-contrib.git
+
+Classical backend:
+
+```bash
+uv pip install "nilmtk-contrib[classical] @ git+https://github.com/nilmtk/nilmtk-contrib.git"
 ```
 
-## Docker Support
-Docker is an open-source platform for developing, shipping, and running applications in lightweight, portable containers that bundle code, runtime, libraries, and system tools into a single package. It ensures everyone runs the same environment, regardless of host OS, and keeps nilmtk-contrib’s dependencies contained without polluting the system Python.
+All model backends:
+
+```bash
+uv pip install "nilmtk-contrib[all] @ git+https://github.com/nilmtk/nilmtk-contrib.git"
+```
 
+Development environment:
 
-Build and run locally
+```bash
+uv sync --extra dev
 ```
+
+Backend development examples:
+
+```bash
+uv sync --extra dev --extra torch
+uv sync --extra dev --extra tensorflow
+uv sync --extra dev --extra classical
+```
+
+## Dependency Extras
+
+| Extra | Intended use | Main dependencies |
+|---|---|---|
+| Minimal | Import package metadata and lightweight modules | No required runtime dependencies |
+| `tensorflow` | TensorFlow/Keras disaggregators | NILMTK, NumPy, pandas, scikit-learn, matplotlib, TensorFlow, `tensorflow-io-gcs-filesystem` |
+| `torch` | PyTorch disaggregators | NILMTK, NumPy, pandas, scikit-learn, matplotlib, PyTorch, tqdm |
+| `classical` | AFHMM, AFHMM_SAC, DSC | NILMTK, NumPy, pandas, matplotlib, scikit-learn, SciPy, cvxpy, hmmlearn |
+| `all` | All backends | Union of TensorFlow, PyTorch, classical, and NILMTK dependencies |
+| `dev` | Tests, formatting, and build checks | pytest, pytest-cov, black, ruff, build |
+
+## Models
+
+The table below lists the public model surface. "Verification" describes how the implementation should be cited and interpreted in research use.
+
+| Algorithm | Backend | Import path | Verification | Paper/source | Notes |
+|---|---|---|---|---|---|
+| AFHMM | Classical | `nilmtk_contrib.disaggregate.AFHMM` | NILM paper implementation, not independently benchmark-certified in this package state | Kolter and Jaakkola, AFHMM for energy disaggregation | Requires `classical` extra |
+| AFHMM_SAC | Classical | `nilmtk_contrib.disaggregate.AFHMM_SAC` | NILM paper implementation, not independently benchmark-certified in this package state | Zhong, Goddard, and Sutton, signal aggregate constraints in AFHMMs | Requires `classical` extra |
+| DSC | Classical | `nilmtk_contrib.disaggregate.DSC` | NILM paper implementation, not independently benchmark-certified in this package state | Kolter, Batra, and Ng, discriminative sparse coding | Requires `classical` extra |
+| DAE | TensorFlow | `nilmtk_contrib.disaggregate.DAE` | Neural NILM implementation requiring experiment validation for new claims | Kelly and Knottenbelt, Neural NILM | TensorFlow/Keras backend |
+| DAE | PyTorch | `nilmtk_contrib.torch.DAE` | PyTorch implementation requiring parity validation for new claims | Kelly and Knottenbelt, Neural NILM | PyTorch backend |
+| RNN | TensorFlow | `nilmtk_contrib.disaggregate.RNN` | Neural NILM implementation requiring experiment validation for new claims | Kelly and Knottenbelt, Neural NILM | TensorFlow/Keras backend |
+| RNN | PyTorch | `nilmtk_contrib.torch.RNN` | PyTorch implementation requiring parity validation for new claims | Kelly and Knottenbelt, Neural NILM | PyTorch backend |
+| Seq2Point | TensorFlow | `nilmtk_contrib.disaggregate.Seq2Point` | NILM paper implementation requiring dataset-specific validation | Zhang et al., Sequence-to-Point Learning | TensorFlow/Keras backend |
+| Seq2PointTorch | PyTorch | `nilmtk_contrib.torch.Seq2PointTorch` | PyTorch implementation requiring parity validation for new claims | Zhang et al., Sequence-to-Point Learning | PyTorch backend |
+| Seq2Seq | TensorFlow | `nilmtk_contrib.disaggregate.Seq2Seq` | Legacy NILM baseline adapted from a generic sequence model | Sutskever, Vinyals, and Le, sequence-to-sequence learning | Generic architecture citation |
+| Seq2Seq | PyTorch | `nilmtk_contrib.torch.Seq2Seq` | Legacy NILM baseline adapted from a generic sequence model | Sutskever, Vinyals, and Le, sequence-to-sequence learning | Generic architecture citation |
+| WindowGRU | TensorFlow | `nilmtk_contrib.disaggregate.WindowGRU` | NILM paper implementation requiring experiment validation for new claims | Krystalakos, Nalmpantis, and Vrakas, sliding-window GRU | TensorFlow/Keras backend |
+| WindowGRU | PyTorch | `nilmtk_contrib.torch.WindowGRU` | PyTorch implementation requiring parity validation for new claims | Krystalakos, Nalmpantis, and Vrakas, sliding-window GRU | PyTorch backend |
+| RNN_attention | TensorFlow | `nilmtk_contrib.disaggregate.RNN_attention` | Attention-based NILM implementation | Sudoso and Piccialli, attention-based NILM | TensorFlow/Keras backend |
+| RNN_attention | PyTorch | `nilmtk_contrib.torch.RNN_attention` | PyTorch attention-based NILM implementation | Attention-based NILM literature | PyTorch backend |
+| RNN_attention_classification | TensorFlow | `nilmtk_contrib.disaggregate.RNN_attention_classification` | Attention-based NILM implementation with classification branch | Sudoso and Piccialli, attention-based NILM | Explicit on/off threshold parameters are supported |
+| RNN_attention_classification | PyTorch | `nilmtk_contrib.torch.RNN_attention_classification` | PyTorch attention-based NILM implementation with classification branch | Attention-based NILM literature | Explicit on/off threshold parameters are supported |
+| ResNet | TensorFlow | `nilmtk_contrib.disaggregate.ResNet` | 1D residual NILM adaptation of a generic architecture | He et al., Deep Residual Learning | Generic computer-vision architecture adapted to NILM |
+| ResNet | PyTorch | `nilmtk_contrib.torch.ResNet` | 1D residual NILM adaptation of a generic architecture | He et al., Deep Residual Learning | Generic computer-vision architecture adapted to NILM |
+| ResNet_classification | TensorFlow | `nilmtk_contrib.disaggregate.ResNet_classification` | Residual NILM model with classification branch | Residual and NILM classification literature | Explicit threshold and loss-weight parameters are supported |
+| ResNet_classification | PyTorch | `nilmtk_contrib.torch.ResNet_classification` | Residual NILM model with classification branch | Residual and NILM classification literature | Explicit threshold and loss-weight parameters are supported |
+| BERT | TensorFlow | `nilmtk_contrib.disaggregate.BERT` | Transformer/BERT-inspired NILM adaptation | Devlin et al., BERT | Does not claim NLP-style pretraining |
+| BERT | PyTorch | `nilmtk_contrib.torch.BERT` | Transformer/BERT-inspired NILM adaptation | Devlin et al., BERT | Does not claim NLP-style pretraining |
+| ConvLSTM | PyTorch | `nilmtk_contrib.torch.ConvLSTM` | ConvLSTM-inspired NILM adaptation | Shi et al., ConvLSTM | Generic spatiotemporal architecture adapted to NILM |
+| TCN | PyTorch | `nilmtk_contrib.torch.TCN` | Generic TCN sequence-modeling baseline adapted to NILM | Bai, Kolter, and Koltun, TCN | PyTorch backend |
+| Reformer | PyTorch | `nilmtk_contrib.torch.Reformer` | Reformer-inspired NILM adaptation | Kitaev, Kaiser, and Levskaya, Reformer | Efficient Transformer architecture adapted to NILM |
+| MSDC | PyTorch | `nilmtk_contrib.torch.MSDC` | NILM paper implementation requiring experiment validation for new claims | MSDC dual-CNN NILM paper | Canonical CRF-enabled implementation path |
+| MSDC without CRF | PyTorch | `nilmtk_contrib.torch.msdc_without_crf.MSDC` | MSDC ablation | MSDC paper/source implementation | No-CRF ablation, not the canonical MSDC path |
+| NILMFormer | PyTorch | `nilmtk_contrib.torch.NILMFormer` | NILMFormer implementation requiring experiment validation for new claims | Petralia et al., NILMFormer | PyTorch backend |
+
+## Research Use And Reproducibility
+
+Use the model table to choose the correct backend and citation. Generic architecture papers support architecture inspiration only; they should not be cited as NILM-specific evidence by themselves.
+
+For reproducible experiments:
+
+- Record the Python version, package extras, dataset, building, appliance list, sampling period, random seed, and hardware.
+- Run backend-specific smoke tests before running full experiments.
+- Verify TensorFlow/PyTorch parity before comparing paired implementations.
+- Verify model output lengths and indices before computing NILMTK metrics.
+- Treat notebook outputs as historical examples unless rerun in the current environment.
+
+Recommended fast checks for source validation:
+
+```bash
+python -m compileall -q nilmtk_contrib tests
+python -m pytest -q tests/test_imports.py tests/test_params.py tests/test_preprocessing_windows.py tests/test_preprocessing_alignment.py tests/test_preprocessing_classification.py tests/test_validation.py tests/test_checkpoints.py tests/test_random_logging.py tests/test_model_runtime.py
+python -m build
+```
+
+Run backend smoke checks in an environment that has the corresponding extra installed: import the target model classes and run a small dataset-specific training or prediction job before launching full experiments. For example:
+
+```bash
+uv sync --extra dev --extra torch
+python -m pytest -q
+```
+
+## Reference Papers And Codebases
+
+NILM-specific references:
+
+- Kolter and Jaakkola, "Approximate Inference in Additive Factorial HMMs with Application to Energy Disaggregation", AISTATS 2012, https://proceedings.mlr.press/v22/zico12.html.
+- Zhong, Goddard, and Sutton, "Signal Aggregate Constraints in Additive Factorial HMMs, with Application to Energy Disaggregation", NeurIPS 2014, https://papers.nips.cc/paper/5526-signal-aggregate-constraints-in-additive-factorial-hmms-with-application-to-energy-disaggregation.
+- Kolter, Batra, and Ng, "Energy Disaggregation via Discriminative Sparse Coding", NeurIPS 2010, https://papers.nips.cc/paper/4054-energy-disaggregation-via-discriminative-sparse-coding.
+- Kelly and Knottenbelt, "Neural NILM: Deep Neural Networks Applied to Energy Disaggregation", arXiv:1507.06594, https://arxiv.org/abs/1507.06594.
+- Zhang et al., "Sequence-to-Point Learning With Neural Networks for Non-Intrusive Load Monitoring", AAAI 2018, DOI: https://doi.org/10.1609/aaai.v32i1.11873.
+- Krystalakos, Nalmpantis, and Vrakas, "Sliding Window Approach for Online Energy Disaggregation Using Artificial Neural Networks", DOI: https://doi.org/10.1145/3200947.3201011.
+- Sudoso and Piccialli, "Non-Intrusive Load Monitoring with an Attention-based Deep Neural Network", arXiv:1912.00759, https://arxiv.org/abs/1912.00759.
+- MSDC, "Exploiting Multi-State Power Consumption in Non-intrusive Load Monitoring based on A Dual-CNN Model", arXiv:2302.05565, https://arxiv.org/abs/2302.05565.
+- Petralia et al., "NILMFormer: Non-Intrusive Load Monitoring that Accounts for Non-Stationarity", arXiv:2506.05880, https://arxiv.org/abs/2506.05880.
+
+Generic architecture references:
+
+- Sutskever, Vinyals, and Le, "Sequence to Sequence Learning with Neural Networks", arXiv:1409.3215, https://arxiv.org/abs/1409.3215.
+- He et al., "Deep Residual Learning for Image Recognition", arXiv:1512.03385, https://arxiv.org/abs/1512.03385.
+- Devlin et al., "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", arXiv:1810.04805, https://arxiv.org/abs/1810.04805.
+- Shi et al., "Convolutional LSTM Network: A Machine Learning Approach for Precipitation Nowcasting", arXiv:1506.04214, https://arxiv.org/abs/1506.04214.
+- Bai, Kolter, and Koltun, "An Empirical Evaluation of Generic Convolutional and Recurrent Networks for Sequence Modeling", arXiv:1803.01271, https://arxiv.org/abs/1803.01271.
+- Kitaev, Kaiser, and Levskaya, "Reformer: The Efficient Transformer", arXiv:2001.04451, https://arxiv.org/abs/2001.04451.
+
+Reference repositories:
+
+- Attention-NILM: https://github.com/antoniosudoso/attention-nilm.
+- NILMFormer: https://github.com/adrienpetralia/NILMFormer.
+- TCN: https://github.com/locuslab/TCN.
+
+## Usage
+
+The sample notebooks under [sample_notebooks](sample_notebooks) demonstrate the NILMTK rapid experimentation API. Install the relevant backend extra and ensure datasets are available before running them.
+
+Supported experiment workflows include:
+
+- Training and testing across multiple appliances.
+- Training and testing across multiple datasets for transfer learning.
+- Training and testing across multiple buildings.
+- Training and testing with artificial aggregate.
+- Training and testing with different sampling frequencies.
+
+## Docker
+
+Build and run locally:
+
+```bash
 docker build -t nilmtk-contrib .
 docker run --rm -it nilmtk-contrib bash
 ```
-Pull the pre-built image
-```
+
+The default Dockerfile installs `.[all]`. Edit the Dockerfile to use `.[torch]`, `.[tensorflow]`, or `.[classical]` for a narrower backend image.
+
+Pull the pre-built image:
+
+```bash
 docker pull ghcr.io/enfuego27826/nilmtk-contrib:latest
 docker run --rm -it ghcr.io/enfuego27826/nilmtk-contrib:latest bash
 ```
 
-Refer to this [notebook](https://github.com/nilmtk/nilmtk-contrib/tree/master/sample_notebooks) for using the nilmtk-contrib algorithms, using the new NILMTK-API.
-
-## Dependencies
-
-- NILMTK>=0.4
-- scikit-learn>=0.21 (already required by NILMTK)
-- Tensorflow >= 2.12.0 < 2.16.0 
-- cvxpy>=1.0.0
+## Citation
 
-**Note: For faster computation of neural networks, it is suggested that you install keras-gpu, since it can take advantage of GPUs. The algorithms AFHMM, AFHMM_SAC and DSC are CPU intensive, use a system with good CPU for these algorithms.**
+If you find this repository useful for your research, please cite:
 
+```bibtex
+@inproceedings{10.1145/3360322.3360844,
+author = {Batra, Nipun and Kukunuri, Rithwik and Pandey, Ayush and Malakar, Raktim and Kumar, Rajat and Krystalakos, Odysseas and Zhong, Mingjun and Meira, Paulo and Parson, Oliver},
+title = {Towards Reproducible State-of-the-Art Energy Disaggregation},
+year = {2019},
+isbn = {9781450370059},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/3360322.3360844},
+doi = {10.1145/3360322.3360844},
+booktitle = {Proceedings of the 6th ACM International Conference on Systems for Energy-Efficient Buildings, Cities, and Transportation},
+pages = {193--202},
+numpages = {10},
+keywords = {smart meters, energy disaggregation, non-intrusive load monitoring},
+location = {New York, NY, USA},
+series = {BuildSys '19}
+}
+```
diff --git a/nilmtk_contrib/__init__.py b/nilmtk_contrib/__init__.py
index 662b7f6..43efbbb 100644
--- a/nilmtk_contrib/__init__.py
+++ b/nilmtk_contrib/__init__.py
@@ -1,8 +1,3 @@
-from . import disaggregate
 from .version import version as __version__
-import pandas as pd
 
-if not hasattr(pd.DataFrame, "append"):
-    def _df_append(self, other, ignore_index=False, verify_integrity=False, sort=False):
-        return pd.concat([self, other], ignore_index=ignore_index, verify_integrity=verify_integrity, sort=sort)
-    pd.DataFrame.append = _df_append
+__all__ = ["__version__"]
diff --git a/nilmtk_contrib/disaggregate/WindowGRU.py b/nilmtk_contrib/disaggregate/WindowGRU.py
index 3aa1d1c..2490b98 100644
--- a/nilmtk_contrib/disaggregate/WindowGRU.py
+++ b/nilmtk_contrib/disaggregate/WindowGRU.py
@@ -7,9 +7,15 @@
 from tensorflow.keras.models import Sequential
 
 
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+from nilmtk_contrib.utils.validation import train_validation_split
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 class WindowGRU(Disaggregator):
 
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "tensorflow"))
 
         self.MODEL_NAME = "WindowGRU"
         self.file_prefix = "{}-temp-weights".format(self.MODEL_NAME.lower())
@@ -37,28 +43,30 @@ def partial_fit(self, train_main, train_appliances, do_preprocessing=True, curre
         train_appliances = new_train_appliances
         for app_name, app_df in train_appliances:
             if app_name not in self.models:
-                print("First model training for", app_name)
+                _log_print("First model training for", app_name)
                 self.models[app_name] = self.return_network()
             else:
-                print("Started re-training model for", app_name)
+                _log_print("Started re-training model for", app_name)
 
             model = self.models[app_name]
             mains = train_main.reshape((-1,self.sequence_length,1))
             app_reading = app_df.reshape((-1,1))
-            filepath = self.file_prefix + "-{}-epoch{}.h5".format(
-                    "_".join(app_name.split()),
-                    current_epoch,
-            )
-            checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1,save_best_only=True,mode='min')
+            filepath = checkpoint_path(".h5")
+            checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1 if self.verbose else 0,save_best_only=True,mode='min')
+            split = train_validation_split(mains, app_reading, validation_fraction=0.15, strategy='tail', allow_no_validation=True)
+            if not split.metadata.should_train:
+                continue
             model.fit(
-                    mains, app_reading,
-                    validation_split=.15,
+                    split.X_train, split.y_train,
+                    validation_data=(split.X_val, split.y_val) if split.metadata.validation_enabled else None,
                     epochs=self.n_epochs,
                     batch_size=self.batch_size,
-                    callbacks=[ checkpoint ],
+                    callbacks=[checkpoint] if split.metadata.validation_enabled else [],
                     shuffle=True,
+                    verbose=1 if self.verbose else 0,
             )
-            model.load_weights(filepath)
+            if split.metadata.validation_enabled and filepath.exists():
+                model.load_weights(filepath)
 
     def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
 
@@ -86,9 +94,8 @@ def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
         return test_predictions
 
     def call_preprocessing(self, mains_lst, submeters_lst, method):
-        max_val = self.max_val
         if method == 'train':
-            print("Training processing")
+            _log_print("Training processing")
             processed_mains = []
 
             for mains in mains_lst:
diff --git a/nilmtk_contrib/disaggregate/__init__.py b/nilmtk_contrib/disaggregate/__init__.py
index 9d560df..1ce1ca4 100644
--- a/nilmtk_contrib/disaggregate/__init__.py
+++ b/nilmtk_contrib/disaggregate/__init__.py
@@ -1,14 +1,81 @@
-from nilmtk.disaggregate import Disaggregator
-from .dae import DAE
-from .dsc import DSC
-from .afhmm import AFHMM
-from .afhmm_sac import AFHMM_SAC
-from .seq2point import Seq2Point
-from .seq2seq import Seq2Seq
-from .WindowGRU import WindowGRU
-from .rnn import RNN
-from .rnn_attention import RNN_attention
-from .rnn_attention_classification import RNN_attention_classification
-from .resnet import ResNet
-from .resnet_classification import ResNet_classification
-from .bert import BERT
\ No newline at end of file
+"""Lazy exports for TensorFlow and classical NILMTK disaggregators.
+
+These classes require optional backend dependencies. Importing this package does
+not import TensorFlow, cvxpy, hmmlearn, or NILMTK until a class is requested.
+"""
+
+from importlib import import_module
+
+from nilmtk_contrib.utils.optional_imports import OptionalDependencyError
+
+_EXPORTS = {  # public name -> (module to import, default extra for install hints, label used in error messages)
+    "AFHMM": ("nilmtk_contrib.disaggregate.afhmm", "classical", "AFHMM"),
+    "AFHMM_SAC": ("nilmtk_contrib.disaggregate.afhmm_sac", "classical", "AFHMM_SAC"),
+    "BERT": ("nilmtk_contrib.disaggregate.bert", "tensorflow", "BERT"),
+    "DAE": ("nilmtk_contrib.disaggregate.dae", "tensorflow", "DAE"),
+    "DSC": ("nilmtk_contrib.disaggregate.dsc", "classical", "DSC"),
+    "RNN": ("nilmtk_contrib.disaggregate.rnn", "tensorflow", "RNN"),
+    "RNN_attention": (
+        "nilmtk_contrib.disaggregate.rnn_attention",
+        "tensorflow",
+        "RNN_attention",
+    ),
+    "RNN_attention_classification": (
+        "nilmtk_contrib.disaggregate.rnn_attention_classification",
+        "tensorflow",
+        "RNN_attention_classification",
+    ),
+    "ResNet": ("nilmtk_contrib.disaggregate.resnet", "tensorflow", "ResNet"),
+    "ResNet_classification": (
+        "nilmtk_contrib.disaggregate.resnet_classification",
+        "tensorflow",
+        "ResNet_classification",
+    ),
+    "Seq2Point": ("nilmtk_contrib.disaggregate.seq2point", "tensorflow", "Seq2Point"),
+    "Seq2Seq": ("nilmtk_contrib.disaggregate.seq2seq", "tensorflow", "Seq2Seq"),
+    "WindowGRU": ("nilmtk_contrib.disaggregate.WindowGRU", "tensorflow", "WindowGRU"),
+}
+
+_DEPENDENCY_EXTRAS = {  # missing module name -> extra named in the install hint; unlisted names fall back to the export's own extra
+    "cvxpy": "classical",
+    "hmmlearn": "classical",
+    "nilmtk": "nilm",
+    "sklearn": "classical",
+    "tensorflow": "tensorflow",
+}
+
+__all__ = sorted([*_EXPORTS, "Disaggregator"])
+
+
+def __getattr__(name):
+    """Resolve a lazily exported attribute on first access (PEP 562).
+
+    The resolved object is cached in module globals so later lookups skip this
+    hook. Raises AttributeError for unknown names and OptionalDependencyError
+    when a third-party dependency of the requested class is not installed.
+    """
+    if name == "Disaggregator":
+        module_name, extra_name, purpose = "nilmtk.disaggregate", "nilm", name
+    elif name in _EXPORTS:
+        module_name, extra_name, purpose = _EXPORTS[name]
+    else:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+    try:
+        module = import_module(module_name)
+    except ModuleNotFoundError as exc:
+        # exc.name can be a dotted submodule; extras are keyed by top-level package.
+        missing_package = (exc.name or "required dependency").split(".")[0]
+        if missing_package == __name__.split(".")[0]:
+            # A missing module of this package is a bug, not a missing extra.
+            raise
+        install_extra = _DEPENDENCY_EXTRAS.get(missing_package, extra_name)
+        message = (
+            f"{purpose} requires '{missing_package}'. "
+            f"Install nilmtk-contrib[{install_extra}]."
+        )
+        raise OptionalDependencyError(message) from exc
+
+    value = getattr(module, name)
+    globals()[name] = value
+    return value
diff --git a/nilmtk_contrib/disaggregate/afhmm.py b/nilmtk_contrib/disaggregate/afhmm.py
index ad16433..09cbf07 100644
--- a/nilmtk_contrib/disaggregate/afhmm.py
+++ b/nilmtk_contrib/disaggregate/afhmm.py
@@ -7,9 +7,16 @@
 from hmmlearn import hmm
 from multiprocessing import Process, Manager
 
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger
+from nilmtk_contrib.utils.params import validate_positive_int
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 class AFHMM(Disaggregator):
 
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy"))
+        super().__init__()
         self.model = []
         self.MODEL_NAME = 'AFHMM'        
         self.models = []
@@ -19,12 +26,35 @@ def __init__(self, params):
         self.time_period = 720
         self.time_period = params.get('time_period', self.time_period)
         self.default_num_states = params.get('default_num_states',2)
+        self.time_period = validate_positive_int("time_period", self.time_period)
+        self.default_num_states = validate_positive_int("default_num_states", self.default_num_states)
+        if self.default_num_states < 2:
+            raise ValueError("default_num_states must be at least 2.")
+        self.max_workers = params.get("max_workers")
+        if self.max_workers is not None:
+            self.max_workers = validate_positive_int("max_workers", self.max_workers)
+        self.solver = params.get("solver", cvx.SCS)
+        self.max_iters = params.get("max_iters")
+        self.eps = params.get("eps")
+        self.warm_start = params.get("warm_start", True)
         self.save_model_path = params.get('save-model-path', None)
         self.load_model_path = params.get('pretrained-model-path',None)
         self.chunk_wise_training =  False
         if self.load_model_path:
             self.load_model(self.load_model_path)
 
+    def _solve_problem(self, problem):
+        """Run ``problem.solve`` with the configured options and return its result.
+
+        ``max_iters`` and ``eps`` are forwarded only when they are not None.
+        """
+        optional = {"max_iters": self.max_iters, "eps": self.eps}
+        extras = {key: val for key, val in optional.items() if val is not None}
+        return problem.solve(
+            solver=self.solver, verbose=self.verbose, warm_start=self.warm_start,
+            **extras,
+        )
+
 
     def partial_fit(self, train_main, train_appliances, **load_kwargs):
         
@@ -41,14 +71,13 @@ def partial_fit(self, train_main, train_appliances, **load_kwargs):
         train_appliances = train_app_tmp
         learnt_model = OrderedDict()
         means_vector = []
-        one_hot_states_vector = []
         pi_s_vector = []
         transmat_vector = []
         states_vector = []
         train_main = train_main.values.flatten().reshape((-1,1))
 
         for appliance_name, power in train_appliances:
-            #print (appliance_name)
+            #_log_print(appliance_name)
             # Learning the pi's and transistion probabliites  for each appliance using a simple HMM
             self.appliances.append(appliance_name)    
             X = power.values.reshape((-1,1))
@@ -70,8 +99,7 @@ def partial_fit(self, train_main, train_appliances, **load_kwargs):
             for i in keys:
                 pi.append(counter[i]/total)
             pi = np.array(pi)
-            nb_classes = self.default_num_states
-            targets = states.reshape(-1)
+            states.reshape(-1)
             means_vector.append(means)
             pi_s_vector.append(pi)
             transmat_vector.append(transmat.T)
@@ -83,7 +111,7 @@ def partial_fit(self, train_main, train_appliances, **load_kwargs):
         self.pi_s_vector = pi_s_vector
         self.means_vector = means_vector
         self.transmat_vector = transmat_vector
-        print ("Finished Training")
+        _log_print("Finished Training")
 
     def disaggregate_thread(self, test_mains,index,d):
 
@@ -96,10 +124,13 @@ def disaggregate_thread(self, test_mains,index,d):
 
         sigma = 100*np.ones((len(test_mains),1))
         flag = 0
+        s_ = None
 
         for epoch in range(6):
             # The alernative Minimization
             if epoch%2==1:
+                if s_ is None or any(state is None for state in s_):  # .value stays None when the solve yields no solution
+                    raise RuntimeError(f"{self.MODEL_NAME} solver did not produce appliance states.")
                 usage = np.zeros((len(test_mains)))
                 for appliance_id in range(self.num_appliances):
                     app_usage= np.sum(s_[appliance_id]@means_vector[appliance_id],axis=1)
@@ -112,7 +143,6 @@ def disaggregate_thread(self, test_mains,index,d):
                     constraints = []
                     cvx_state_vectors = []
                     cvx_variable_matrices = []
-                    delta = cvx.Variable(shape=(len(test_mains),1), name='delta_t')
                     for appliance_id in range(self.num_appliances):
                             state_vector = cvx.Variable(shape=(len(test_mains), self.default_num_states), name='state_vec-%s'%(appliance_id))                    
                             cvx_state_vectors.append(state_vector)
@@ -177,7 +208,7 @@ def disaggregate_thread(self, test_mains,index,d):
                 expression = term_1 + term_2 + term_3 + term_4
                 expression = cvx.Minimize(expression)
                 prob = cvx.Problem(expression, constraints,)                
-                prob.solve(solver=cvx.SCS,verbose=False,warm_start=True)
+                self._solve_problem(prob)
                 s_ = [i.value for i in cvx_state_vectors]
 
         prediction_dict = {}
@@ -193,11 +224,11 @@ def disaggregate_thread(self, test_mains,index,d):
     def disaggregate_chunk(self, test_mains_list):
 
         # Sistributes the test mains across multiple threads and runs them in parallel
-        manager = Manager()
-        d = manager.dict()
-        
         predictions_lst = []
-        for test_mains in test_mains_list:        
+        for test_mains in test_mains_list:
+            original_length = len(test_mains)
+            manager = Manager()
+            d = manager.dict()
             test_mains_big = test_mains.values.flatten().reshape((-1,1))
             self.arr_of_results = []        
             threads = []
@@ -206,15 +237,24 @@ def disaggregate_chunk(self, test_mains_list):
                 t = Process(target=self.disaggregate_thread, args=(test_mains,test_block,d))
                 threads.append(t)
 
-            for t in threads:
-                t.start()
-
-            for t in threads:
-                t.join()
+            worker_limit = self.max_workers or len(threads) or 1
+            for start in range(0, len(threads), worker_limit):
+                active_threads = threads[start:start + worker_limit]
+                for t in active_threads:
+                    t.start()
+                for t in active_threads:
+                    t.join()
+                # Reap the whole batch before raising so a failure never leaves sibling workers running.
+                failed_codes = [t.exitcode for t in active_threads if t.exitcode != 0]
+                if failed_codes:
+                    raise RuntimeError(f"{self.MODEL_NAME} worker failed with exit code {failed_codes[0]}.")
 
             for i in range(len(threads)):
+                if i not in d:
+                    raise RuntimeError(f"{self.MODEL_NAME} worker {i} did not return results.")
                 self.arr_of_results.append(d[i])
             prediction = pd.concat(self.arr_of_results,axis=0)
+            prediction = prediction.iloc[:original_length]
             predictions_lst.append(prediction)
             
         return predictions_lst
diff --git a/nilmtk_contrib/disaggregate/afhmm_sac.py b/nilmtk_contrib/disaggregate/afhmm_sac.py
index 1e87b27..c8e1ec6 100644
--- a/nilmtk_contrib/disaggregate/afhmm_sac.py
+++ b/nilmtk_contrib/disaggregate/afhmm_sac.py
@@ -7,10 +7,17 @@
 from hmmlearn import hmm
 from multiprocessing import Process, Manager
 
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger
+from nilmtk_contrib.utils.params import validate_positive_int
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 class AFHMM_SAC(Disaggregator):
     """1 dimensional baseline Mean algorithm."""
 
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy"))
+        super().__init__()
         self.model = []
         self.MIN_CHUNK_LENGTH = 100
         self.MODEL_NAME = 'AFHMM_SAC'
@@ -22,12 +29,36 @@ def __init__(self, params):
         self.signal_aggregates = OrderedDict()
         self.time_period = params.get('time_period', self.time_period)
         self.default_num_states = params.get('default_num_states',2)
+        self.time_period = validate_positive_int("time_period", self.time_period)
+        self.default_num_states = validate_positive_int("default_num_states", self.default_num_states)
+        if self.default_num_states < 2:
+            raise ValueError("default_num_states must be at least 2.")
+        self.max_workers = params.get("max_workers")
+        if self.max_workers is not None:
+            self.max_workers = validate_positive_int("max_workers", self.max_workers)
+        self.solver = params.get("solver", cvx.SCS)
+        self.max_iters = params.get("max_iters")
+        self.eps = params.get("eps")
+        self.warm_start = params.get("warm_start", True)
+        self.sac_strength = params.get("sac_strength", 1.0)
         self.save_model_path = params.get('save-model-path', None)
         self.load_model_path = params.get('pretrained-model-path',None)
         self.chunk_wise_training = False
         if self.load_model_path:
             self.load_model(self.load_model_path)
 
+    def _solve_problem(self, problem):
+        solve_kwargs = {
+            "solver": self.solver,
+            "verbose": self.verbose,
+            "warm_start": self.warm_start,
+        }
+        if self.max_iters is not None:
+            solve_kwargs["max_iters"] = self.max_iters
+        if self.eps is not None:
+            solve_kwargs["eps"] = self.eps
+        return problem.solve(**solve_kwargs)
+
 
 
     def partial_fit(self, train_main, train_appliances, **load_kwargs):
@@ -53,7 +84,6 @@ def partial_fit(self, train_main, train_appliances, **load_kwargs):
 
         means_vector = []
 
-        one_hot_states_vector = []
 
         pi_s_vector = []
 
@@ -64,7 +94,7 @@ def partial_fit(self, train_main, train_appliances, **load_kwargs):
         train_main = train_main.values.flatten().reshape((-1,1))
         
         for appliance_name, power in train_appliances:
-            #print (appliance_name)
+            #_log_print(appliance_name)
             self.appliances.append(appliance_name)
             
             X = power.values.reshape((-1,1))
@@ -91,8 +121,7 @@ def partial_fit(self, train_main, train_appliances, **load_kwargs):
             
             pi = np.array(pi)
 
-            nb_classes = self.default_num_states
-            targets = states.reshape(-1)
+            states.reshape(-1)
             
             means_vector.append(means)
             pi_s_vector.append(pi)
@@ -106,28 +135,26 @@ def partial_fit(self, train_main, train_appliances, **load_kwargs):
         self.means_vector = means_vector
         self.transmat_vector = transmat_vector
 
-#         print(transmat_vector)
-#         print (means_vector)
-#         print (states_vector)
-#         print (pi_s_vector)
-        print ("Finished Training")
-#         print (self.signal_aggregates)
-#        print (np.log(transmat))
-#        print(pi)
-#        print (np.log(pi))
-        #print (np.sum(transmat_vector[0],axis=1))
-        #print (np.sum(transmat_vector[0],axis=0))
-            #print (states.shape)
-            #print (one_hot_targets.shape)
+#         _log_print(means_vector)
+#         _log_print(states_vector)
+#         _log_print(pi_s_vector)
+        _log_print("Finished Training")
+#         _log_print(self.signal_aggregates)
+#        _log_print(np.log(transmat))
+#        _log_print(np.log(pi))
+        #_log_print(np.sum(transmat_vector[0],axis=1))
+        #_log_print(np.sum(transmat_vector[0],axis=0))
+            #_log_print(states.shape)
+            #_log_print(one_hot_targets.shape)
 
         # one_hot_states_vector = np.array(one_hot_states_vector)
 
-        # # print (transmat_vector[0])
-        # # print (np.sum(transmat_vector[0],axis=0))
-        # # print (np.sum(transmat_vector[0],axis=1))
+        # # _log_print(transmat_vector[0])
+        # # _log_print(np.sum(transmat_vector[0],axis=0))
+        # # _log_print(np.sum(transmat_vector[0],axis=1))
         # appliance_variable_matrix = []
 
-        # #print (len(states_vector))
+        # #_log_print(len(states_vector))
         # #variable_matrix = np.zeros((len(appliance_states),self.default_num_states,self.default_num_states))
 
         # for appliance_states in states_vector:
@@ -153,8 +180,11 @@ def disaggregate_thread(self, test_mains,index,d):
         transmat_vector = self.transmat_vector
         sigma = 100*np.ones((len(test_mains),1))
         flag = 0
+        s_ = None
         for epoch in range(6):
             if epoch%2==1:
+                if s_ is None or any(state is None for state in s_):  # .value stays None when the solve yields no solution
+                    raise RuntimeError(f"{self.MODEL_NAME} solver did not produce appliance states.")
                 # The alernative Minimization
                 usage = np.zeros((len(test_mains)))
                 for appliance_id in range(self.num_appliances):
@@ -168,7 +198,6 @@ def disaggregate_thread(self, test_mains,index,d):
                     constraints = []
                     cvx_state_vectors = []
                     cvx_variable_matrices = []
-                    delta = cvx.Variable(shape=(len(test_mains),1), name='delta_t')
 
                     for appliance_id in range(self.num_appliances):
                             state_vector = cvx.Variable(shape=(len(test_mains), self.default_num_states), name='state_vec-%s'%(appliance_id))                    
@@ -202,13 +232,17 @@ def disaggregate_thread(self, test_mains,index,d):
                     for appliance_id in range(self.num_appliances):
                         appliance_usage = cvx_state_vectors[appliance_id]@means_vector[appliance_id]
                         total_appliance_usage = cvx.sum(appliance_usage)
-                        constraints+=[total_appliance_usage <= self.signal_aggregates[self.appliances[appliance_id]]]
+                        aggregate_limit = (
+                            self.sac_strength
+                            * self.signal_aggregates[self.appliances[appliance_id]]
+                        )
+                        constraints+=[total_appliance_usage <= aggregate_limit]
 
 
                     # Second order cone constraints
                     
                     total_observed_reading = np.zeros((test_mains.shape))
-                        #print (len(cvx_state_vectors))
+                        #_log_print(len(cvx_state_vectors))
                     for appliance_id in range(self.num_appliances):
                                 total_observed_reading+=cvx_state_vectors[appliance_id]@means_vector[appliance_id]                    
                     flag=1
@@ -243,7 +277,7 @@ def disaggregate_thread(self, test_mains,index,d):
                 expression = cvx.Minimize(expression)
                 prob = cvx.Problem(expression, constraints)
 
-                prob.solve(solver=cvx.SCS,verbose=False, warm_start=True)
+                self._solve_problem(prob)
                 s_ = [i.value for i in cvx_state_vectors]
 
         prediction_dict = {}
@@ -262,12 +296,12 @@ def disaggregate_thread(self, test_mains,index,d):
 
     def disaggregate_chunk(self, test_mains_list):
 
-        # Sistributes the test mains across multiple threads and runs them in parallel
-        manager = Manager()
-        d = manager.dict()
-        
+        # Distributes the test mains across multiple workers and runs them in parallel.
         predictions_lst = []
-        for test_mains in test_mains_list:        
+        for test_mains in test_mains_list:
+            original_length = len(test_mains)
+            manager = Manager()
+            d = manager.dict()
             test_mains_big = test_mains.values.flatten().reshape((-1,1))
             self.arr_of_results = []        
             threads = []
@@ -276,17 +310,24 @@ def disaggregate_chunk(self, test_mains_list):
                 t = Process(target=self.disaggregate_thread, args=(test_mains,test_block,d))
                 threads.append(t)
 
-            for t in threads:
-                t.start()
-
-            for t in threads:
-                t.join()
+            worker_limit = self.max_workers or len(threads) or 1
+            for start in range(0, len(threads), worker_limit):
+                active_threads = threads[start:start + worker_limit]
+                for t in active_threads:
+                    t.start()
+                for t in active_threads:
+                    t.join()
+                # Reap the whole batch before raising so a failure never leaves sibling workers running.
+                failed_codes = [t.exitcode for t in active_threads if t.exitcode != 0]
+                if failed_codes:
+                    raise RuntimeError(f"{self.MODEL_NAME} worker failed with exit code {failed_codes[0]}.")
 
             for i in range(len(threads)):
+                if i not in d:
+                    raise RuntimeError(f"{self.MODEL_NAME} worker {i} did not return results.")
                 self.arr_of_results.append(d[i])
             prediction = pd.concat(self.arr_of_results,axis=0)
+            prediction = prediction.iloc[:original_length]
             predictions_lst.append(prediction)
-            
-        return predictions_lst
 
- 
\ No newline at end of file
+        return predictions_lst
diff --git a/nilmtk_contrib/disaggregate/bert.py b/nilmtk_contrib/disaggregate/bert.py
index ec6ce04..0cd076e 100644
--- a/nilmtk_contrib/disaggregate/bert.py
+++ b/nilmtk_contrib/disaggregate/bert.py
@@ -1,26 +1,20 @@
 from __future__ import print_function, division
-from warnings import warn
 
 from nilmtk.disaggregate import Disaggregator
-from tensorflow.keras.layers import Conv1D, Dense, Dropout, Reshape, Flatten,Input,GlobalAveragePooling1D
-from tensorflow.keras.layers import AveragePooling1D
-import os
+from tensorflow.keras.layers import Conv1D, Dense, Dropout, Flatten
 import pandas as pd
 import numpy as np
-import pickle
 from collections import OrderedDict
 
-from tensorflow.keras.optimizers import SGD
-from tensorflow.keras.models import Sequential, load_model
+from tensorflow.keras.models import Sequential
 from tensorflow.keras.layers import Layer,MultiHeadAttention,LayerNormalization,Embedding
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split
+from nilmtk_contrib.utils.validation import safe_train_test_split as train_test_split
 from tensorflow.keras.callbacks import ModelCheckpoint
-import tensorflow.keras.backend as K
-import random
-random.seed(10)
-np.random.seed(10)
 import tensorflow as tf
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 gpus=tf.config.experimental.list_physical_devices("GPU")
 for gpu in gpus:
     tf.config.experimental.set_memory_growth(gpu,True)
@@ -109,6 +103,7 @@ def get_config(self):
 class BERT(Disaggregator):
 
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "tensorflow"))
 
         self.MODEL_NAME = "BERT"
         self.chunk_wise_training = params.get('chunk_wise_training',False)
@@ -120,12 +115,12 @@ def __init__(self, params):
         self.batch_size = params.get('batch_size',512)
         self.appliance_params = params.get('appliance_params',{})
         if self.sequence_length%2==0:
-            print ("Sequence length should be odd!")
+            _log_print("Sequence length should be odd!")
             raise (SequenceLengthError)
 
     def partial_fit(self,train_main,train_appliances,do_preprocessing=True,**load_kwargs):
 
-        print("...............BERT partial_fit running...............")
+        _log_print("...............BERT partial_fit running...............")
         if len(self.appliance_params) == 0:
             self.set_appliance_params(train_appliances)
 
@@ -144,17 +139,17 @@ def partial_fit(self,train_main,train_appliances,do_preprocessing=True,**load_kw
 
         for appliance_name, power in train_appliances:
             if appliance_name not in self.models:
-                print("First model training for ", appliance_name)
+                _log_print("First model training for ", appliance_name)
                 self.models[appliance_name] = self.return_network()
             else:
-                print("Started Retraining model for ", appliance_name)
+                _log_print("Started Retraining model for ", appliance_name)
 
             model = self.models[appliance_name]
             if train_main.size > 0:
                 # Sometimes chunks can be empty after dropping NANS
                 if len(train_main) > 10:
                     # Do validation when you have sufficient samples
-                    filepath = 'BERT-temp-weights-'+str(random.randint(0,100000))+'.h5'
+                    filepath = checkpoint_path(".h5")
                     checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1,save_best_only=True,mode='min')
                     train_x, v_x, train_y, v_y = train_test_split(train_main, power, test_size=.15,random_state=10)
                     model.fit(train_x,train_y,validation_data=(v_x,v_y),epochs=self.n_epochs,callbacks=[checkpoint],batch_size=self.batch_size)
@@ -187,14 +182,14 @@ def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
                 # the sum_arr keeps the number of times a particular timestamp has occured
                 # the predictions are summed for  agiven time, and is divided by the number of times it has occured
                 
-                l = self.sequence_length
-                n = len(prediction) + l - 1
+                window_length = self.sequence_length
+                n = len(prediction) + window_length - 1
                 sum_arr = np.zeros((n))
                 counts_arr = np.zeros((n))
-                o = len(sum_arr)
+                len(sum_arr)
                 for i in range(len(prediction)):
-                    sum_arr[i:i + l] += prediction[i].flatten()
-                    counts_arr[i:i + l] += 1
+                    sum_arr[i:i + window_length] += prediction[i].flatten()
+                    counts_arr[i:i + window_length] += 1
                 for i in range(len(sum_arr)):
                     sum_arr[i] = sum_arr[i] / counts_arr[i]
 
@@ -253,7 +248,7 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
                     app_mean = self.appliance_params[app_name]['mean']
                     app_std = self.appliance_params[app_name]['std']
                 else:
-                    print ("Parameters for ", app_name ," were not found!")
+                    _log_print("Parameters for ", app_name ," were not found!")
                     raise ApplianceNotFoundError()
 
 
@@ -287,9 +282,9 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
     def set_appliance_params(self,train_appliances):
 
         for (app_name,df_list) in train_appliances:
-            l = np.array(pd.concat(df_list,axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
+            values = np.array(pd.concat(df_list,axis=0))
+            app_mean = np.mean(values)
+            app_std = np.std(values)
             if app_std<1:
                 app_std = 100
             self.appliance_params.update({app_name:{'mean':app_mean,'std':app_std}})
diff --git a/nilmtk_contrib/disaggregate/dae.py b/nilmtk_contrib/disaggregate/dae.py
index 835e436..6fbf78d 100644
--- a/nilmtk_contrib/disaggregate/dae.py
+++ b/nilmtk_contrib/disaggregate/dae.py
@@ -1,22 +1,33 @@
-from warnings import warn
 from nilmtk.disaggregate import Disaggregator
-from tensorflow.keras.layers import Conv1D, Dense, Dropout, Reshape, Flatten
+from tensorflow.keras.layers import Conv1D, Dense, Reshape, Flatten
 import pandas as pd
 import numpy as np
 from collections import OrderedDict 
-from tensorflow.keras.optimizers import SGD
 from tensorflow.keras.models import Sequential
-import matplotlib.pyplot as  plt
 from tensorflow.keras.callbacks import ModelCheckpoint
-import tensorflow.keras.backend as K
-from statistics import mean
 import os
 import json
+from nilmtk_contrib.utils.checkpoints import (
+    build_metadata,
+    collect_dependencies,
+    load_keras_weights,
+    load_metadata,
+    save_keras_weights,
+    save_metadata,
+    temporary_checkpoint,
+)
+from nilmtk_contrib.utils.logging import get_logger
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print
+from nilmtk_contrib.utils.random import set_random_seed
+from nilmtk_contrib.utils.validation import train_validation_split
 
+logger = get_logger(__name__)
+_log_print = legacy_print(logger)
 
 class DAE(Disaggregator):
 
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "tensorflow"))
         """
         Iniititalize the moel with the given parameters
         """
@@ -31,7 +42,10 @@ def __init__(self, params):
         self.appliance_params = params.get('appliance_params',{})
         self.save_model_path = params.get('save-model-path', None)
         self.load_model_path = params.get('pretrained-model-path',None)
+        self.seed = params.get('seed', None)
+        self.verbose = params.get('verbose', False)
         self.models = OrderedDict()
+        set_random_seed(self.seed, backends=("python", "numpy", "tensorflow"))
         if self.load_model_path:
             self.load_model()
 
@@ -47,7 +61,7 @@ def partial_fit(self, train_main, train_appliances, do_preprocessing=True, curre
 
         # To preprocess the data and bring it to a valid shape
         if do_preprocessing:
-            print ("Preprocessing")
+            logger.info("Preprocessing")
             train_main, train_appliances = self.call_preprocessing(train_main, train_appliances, 'train')
         train_main = pd.concat(train_main, axis=0).values
         train_main = train_main.reshape((-1, self.sequence_length, 1))
@@ -60,36 +74,73 @@ def partial_fit(self, train_main, train_appliances, do_preprocessing=True, curre
         train_appliances = new_train_appliances
         for appliance_name, power in train_appliances:
             if appliance_name not in self.models:
-                print("First model training for", appliance_name)
+                logger.info("First model training for %s.", appliance_name)
                 self.models[appliance_name] = self.return_network()
-                print(self.models[appliance_name].summary())
+                if self.verbose:
+                    self.models[appliance_name].summary()
 
-            print("Started Retraining model for", appliance_name)
+            logger.info("Started retraining model for %s.", appliance_name)
             model = self.models[appliance_name]
-            filepath = self.file_prefix + "-{}-epoch{}.h5".format(
-                    "_".join(appliance_name.split()),
-                    current_epoch,
+            split = train_validation_split(
+                    train_main,
+                    power,
+                    validation_fraction=0.15,
+                    strategy="tail",
+                    min_train=1,
+                    min_val=1,
+                    allow_no_validation=True,
             )
-            checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
-            model.fit(
-                    train_main, power,
-                    validation_split=.15,
+            if not split.metadata.should_train:
+                continue
+
+            with temporary_checkpoint(".h5") as filepath:
+                callbacks = []
+                validation_data = None
+                if split.metadata.validation_enabled:
+                    checkpoint = ModelCheckpoint(
+                            str(filepath),
+                            monitor='val_loss',
+                            verbose=1 if self.verbose else 0,
+                            save_best_only=True,
+                            mode='min',
+                    )
+                    callbacks.append(checkpoint)
+                    validation_data = (split.X_val, split.y_val)
+
+                model.fit(
+                    split.X_train,
+                    split.y_train,
+                    validation_data=validation_data,
                     batch_size=self.batch_size,
                     epochs=self.n_epochs,
-                    callbacks=[ checkpoint ],
+                    callbacks=callbacks,
                     shuffle=True,
-            )
-            model.load_weights(filepath)
+                    verbose=1 if self.verbose else 0,
+                )
+                if split.metadata.validation_enabled and filepath.exists():
+                    load_keras_weights(model, str(filepath))
+                elif not split.metadata.validation_enabled:
+                    save_keras_weights(model, str(filepath))
+                    load_keras_weights(model, str(filepath))
 
         if self.save_model_path:
             self.save_model()
 
     def load_model(self):
-        print ("Loading the model using the pretrained-weights")        
+        logger.info("Loading the model using pretrained weights.")
         model_folder = self.load_model_path
-        with open(os.path.join(model_folder, "model.json"), "r") as f:
-            model_string = f.read().strip()
-            params_to_load = json.loads(model_string)
+        metadata_path = os.path.join(model_folder, "metadata.json")
+        if os.path.exists(metadata_path):
+            params_to_load = load_metadata(
+                    model_folder,
+                    expected_model_class=self.MODEL_NAME,
+                    expected_backend="tensorflow",
+            )
+        else:
+            logger.warning("Loading legacy %s model metadata from model.json.", self.MODEL_NAME)
+            with open(os.path.join(model_folder, "model.json"), "r") as f:
+                model_string = f.read().strip()
+                params_to_load = json.loads(model_string)
 
 
         self.sequence_length = int(params_to_load['sequence_length'])
@@ -99,23 +150,36 @@ def load_model(self):
 
         for appliance_name in self.appliance_params:
             self.models[appliance_name] = self.return_network()
-            self.models[appliance_name].load_weights(os.path.join(model_folder,appliance_name+".h5"))
+            load_keras_weights(
+                    self.models[appliance_name],
+                    os.path.join(model_folder,appliance_name+".h5"),
+            )
 
 
     def save_model(self):
         
-        os.makedirs(self.save_model_path)    
-        params_to_save = {}
-        params_to_save['appliance_params'] = self.appliance_params
-        params_to_save['sequence_length'] = self.sequence_length
-        params_to_save['mains_mean'] = self.mains_mean
-        params_to_save['mains_std'] = self.mains_std
+        os.makedirs(self.save_model_path, exist_ok=True)
+        metadata = build_metadata(
+                model_class=self.MODEL_NAME,
+                backend="tensorflow",
+                sequence_length=self.sequence_length,
+                appliance_params=self.appliance_params,
+                mains_mean=self.mains_mean,
+                mains_std=self.mains_std,
+                dependencies=collect_dependencies([
+                    "nilmtk-contrib",
+                    "tensorflow",
+                    "numpy",
+                    "pandas",
+                ]),
+        )
+        save_metadata(self.save_model_path, metadata)
         for appliance_name in self.models:
-            print ("Saving model for ", appliance_name)
-            self.models[appliance_name].save_weights(os.path.join(self.save_model_path,appliance_name+".h5"))
-
-        with open(os.path.join(self.save_model_path,'model.json'),'w') as file:
-            file.write(json.dumps(params_to_save))
+            logger.info("Saving %s model for %s.", self.MODEL_NAME, appliance_name)
+            save_keras_weights(
+                    self.models[appliance_name],
+                    os.path.join(self.save_model_path,appliance_name+".h5"),
+            )
 
 
 
@@ -211,9 +275,9 @@ def denormalize_output(self,data,mean,std):
     
     def set_appliance_params(self,train_appliances):
         for (app_name,df_list) in train_appliances:
-            l = np.array(pd.concat(df_list,axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
+            values = np.array(pd.concat(df_list,axis=0))
+            app_mean = np.mean(values)
+            app_std = np.std(values)
             if app_std<1:
                 app_std = 100
             self.appliance_params.update({app_name:{'mean':app_mean,'std':app_std}})
diff --git a/nilmtk_contrib/disaggregate/dsc.py b/nilmtk_contrib/disaggregate/dsc.py
index 017a9e7..4ef74ee 100644
--- a/nilmtk_contrib/disaggregate/dsc.py
+++ b/nilmtk_contrib/disaggregate/dsc.py
@@ -1,19 +1,26 @@
 from __future__ import print_function, division
-from warnings import warn
 from nilmtk.disaggregate import Disaggregator
 import pandas as pd
 import numpy as np
 from collections import OrderedDict 
-import matplotlib.pyplot as  plt
 from sklearn.decomposition import MiniBatchDictionaryLearning, SparseCoder
 from sklearn.metrics import mean_squared_error
 import time
-import warnings
-warnings.filterwarnings("ignore")
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger
+from nilmtk_contrib.utils.params import (
+    validate_non_negative_int,
+    validate_positive_int,
+    validate_positive_number,
+)
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 
 class DSC(Disaggregator):
     
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy"))
+        super().__init__()
 
         self.MODEL_NAME = 'DSC'  # Add the name for the algorithm
         self.chunk_wise_training = False
@@ -29,6 +36,14 @@ def __init__(self, params):
         self.iterations = params.get('iterations',self.iterations)
         self.n_epochs = self.iterations
         self.n_components = params.get('n_components',self.n_components)
+        self.sparsity_coef = params.get('sparsity_coef', self.sparsity_coef)
+        self.shape = validate_positive_int("shape", self.shape)
+        self.iterations = validate_non_negative_int("iterations", self.iterations)
+        self.n_epochs = self.iterations
+        self.n_components = validate_positive_int("n_components", self.n_components)
+        self.learning_rate = validate_positive_number("learning_rate", self.learning_rate)
+        self.sparsity_coef = validate_positive_number("sparsity_coef", self.sparsity_coef)
+        self.padding_metadata = []
 
     def learn_dictionary(self, appliance_main, app_name):
 
@@ -39,15 +54,15 @@ def learn_dictionary(self, appliance_main, app_name):
         self.power[app_name] = appliance_main
 
         if app_name not in self.dictionaries:
-            print ("Training First dictionary for ",app_name)
+            _log_print("Training First dictionary for ",app_name)
             model = MiniBatchDictionaryLearning(n_components=self.n_components,positive_code=True,positive_dict=True,transform_algorithm='lasso_lars',alpha=self.sparsity_coef)
         
         else:
-            print ("Re-training dictionary for ",app_name)
+            _log_print("Re-training dictionary for ",app_name)
             model = self.dictionaries[app_name]
         model.fit(appliance_main.T)
         reconstruction = np.matmul(model.components_.T,model.transform(appliance_main.T).T)
-        print ("RMSE reconstruction for appliance %s is %s"%(app_name,mean_squared_error(reconstruction,appliance_main)**(.5)))
+        _log_print("RMSE reconstruction for appliance %s is %s"%(app_name,mean_squared_error(reconstruction,appliance_main)**(.5)))
         self.dictionaries[app_name] = model
         
 
@@ -73,10 +88,9 @@ def discriminative_training(self,concatenated_activations,concatenated_bases, ve
         train_optimal_a = optimal_a[:,:-v_index]
         v_optimal_a = optimal_a[:,-v_index:]
 
-        print ("If Iteration wise errors are not decreasing, then please decrease the learning rate")
+        _log_print("If Iteration wise errors are not decreasing, then please decrease the learning rate")
         for i in range(self.iterations):
 
-            a = time.time()
             # Finding activations for the given bases
             model = SparseCoder(dictionary=predicted_b.T,positive_code=True,transform_algorithm='lasso_lars',transform_alpha=self.sparsity_coef)
             train_predicted_a = model.transform(train_power.T).T
@@ -85,7 +100,7 @@ def discriminative_training(self,concatenated_activations,concatenated_bases, ve
             err = np.mean(np.abs(val_predicted_a - v_optimal_a))
 
             if err<least_error:
-                #print ("Chose the best")
+                #_log_print("Chose the best")
                 least_error = err
                 best_b = np.copy(predicted_b)
                 
@@ -96,8 +111,8 @@ def discriminative_training(self,concatenated_activations,concatenated_bases, ve
             predicted_b = np.where(predicted_b>0,predicted_b,0)
             # Making sure that columns sum to 1
             predicted_b = (predicted_b.T/np.linalg.norm(predicted_b.T,axis=1).reshape((-1,1))).T 
-            #if i%verbose==0:
-            print ("Iteration ",i," Error ",err)
+            if self.verbose and verbose and i % verbose == 0:
+                _log_print("Iteration ",i," Error ",err)
 
         return  best_b
 
@@ -110,13 +125,13 @@ def print_appliance_wise_errors(self, activations, bases):
             pred = np.matmul(bases[:,start_comp:start_comp+n_comps],activations[start_comp:start_comp+n_comps,:])
             start_comp+=n_comps
             #plt.plot(pred.T[home_id],label=i)
-            print ("Error for ",i," is ",mean_squared_error(pred, X)**(.5))
+            _log_print("Error for ",i," is ",mean_squared_error(pred, X)**(.5))
         
     def partial_fit(self, train_main, train_appliances, **load_kwargs):
         
-        print("...............DSC partial_fit running...............")
+        _log_print("...............DSC partial_fit running...............")
 
-        #print (train_main[0])
+        #_log_print(train_main[0])
 
         train_main = pd.concat(train_main,axis=1) #np.array([i.values.reshape((self.sequence_length,1)) for i in train_main])
         
@@ -151,39 +166,48 @@ def partial_fit(self, train_main, train_appliances, **load_kwargs):
 
             concatenated_bases = np.concatenate(concatenated_bases,axis=1)
             concatenated_activations = np.concatenate(concatenated_activations,axis=0)
-            print ("--"*15)
-            print ("Optimal Errors")
+            _log_print("--"*15)
+            _log_print("Optimal Errors")
             self.print_appliance_wise_errors(concatenated_activations, concatenated_bases)
-            print ("--"*15)
+            _log_print("--"*15)
             model = SparseCoder(dictionary=concatenated_bases.T,positive_code=True,transform_algorithm='lasso_lars',transform_alpha=self.sparsity_coef)
             predicted_activations = model.transform(train_main.T).T
-            print ('\n\n')
-            print ("--"*15)
-            print ("Error in prediction before discriminative sparse coding")
+            _log_print('\n\n')
+            _log_print("--"*15)
+            _log_print("Error in prediction before discriminative sparse coding")
             self.print_appliance_wise_errors(predicted_activations, concatenated_bases)
-            print ("--"*15)
-            print ('\n\n')
+            _log_print("--"*15)
+            _log_print('\n\n')
             optimal_b = self.discriminative_training(concatenated_activations,concatenated_bases)
             model = SparseCoder(dictionary=optimal_b.T,positive_code=True,transform_algorithm='lasso_lars',transform_alpha=self.sparsity_coef)
             self.disggregation_model = model
             predicted_activations = model.transform(train_main.T).T
-            print ("--"*15)
-            print ("Model Errors after Discriminative Training")
+            _log_print("--"*15)
+            _log_print("Model Errors after Discriminative Training")
             self.print_appliance_wise_errors(predicted_activations, concatenated_bases)
-            print ("--"*15)
+            _log_print("--"*15)
             self.disaggregation_bases = optimal_b
             self.reconstruction_bases = concatenated_bases
             
         else:
-            print ("This chunk has small number of samples, so skipping the training")
+            _log_print("This chunk has small number of samples, so skipping the training")
 
     def disaggregate_chunk(self, test_main_list):
 
         test_predictions = []
         for test_main in test_main_list:
+            original_length = test_main.size
+            extra_values = 0
             if test_main.size%self.shape!=0:
                 extra_values = self.shape - (test_main.size)%(self.shape)
                 test_main = list(test_main.values.flatten()) + [0]*extra_values
+            self.padding_metadata.append(
+                {
+                    "original_length": original_length,
+                    "padded_length": original_length + extra_values,
+                    "extra_values": extra_values,
+                }
+            )
             test_main = np.array(test_main).reshape((-1,self.shape)).T
             predicted_activations = self.disggregation_model.transform(test_main.T).T
             #predicted_usage = self.reconstruction_bases@predicted_activations
@@ -194,7 +218,9 @@ def disaggregate_chunk(self, test_main_list):
                 predicted_usage = np.matmul(self.reconstruction_bases[:,start_comp:start_comp+n_comps],predicted_activations[start_comp:start_comp+n_comps,:])
                 start_comp+=n_comps
                 predicted_usage = predicted_usage.T.flatten() 
+                predicted_usage = predicted_usage[:original_length]
                 flat_mains = test_main.T.flatten()
+                flat_mains = flat_mains[:original_length]
                 predicted_usage = np.where(predicted_usage>flat_mains,flat_mains,predicted_usage)
                 disggregation_dict[app_name] = pd.Series(predicted_usage)
             results = pd.DataFrame(disggregation_dict, dtype='float32')
diff --git a/nilmtk_contrib/disaggregate/resnet.py b/nilmtk_contrib/disaggregate/resnet.py
index 7964c5e..ce8ac4c 100644
--- a/nilmtk_contrib/disaggregate/resnet.py
+++ b/nilmtk_contrib/disaggregate/resnet.py
@@ -1,32 +1,26 @@
 from __future__ import print_function, division
-from warnings import warn
 
-from tensorflow.keras.layers import Conv2D, ZeroPadding1D,MaxPooling1D
+from tensorflow.keras.layers import ZeroPadding1D,MaxPooling1D
 from tensorflow.keras.layers import Activation
 from tensorflow.keras.layers import BatchNormalization
-from tensorflow.keras.layers import AveragePooling1D
 
 from nilmtk.disaggregate import Disaggregator
-from tensorflow.keras.layers import Layer,Conv1D, Dense, Dropout, Reshape, Flatten,Add,MaxPool1D,BatchNormalization
-import os
+from tensorflow.keras.layers import Layer,Conv1D, Dense, Dropout, Flatten,Add
 import pandas as pd
 import numpy as np
-import pickle
 from collections import OrderedDict
 
-from tensorflow.keras.optimizers import SGD
-from tensorflow.keras.models import Sequential, load_model
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split
+from tensorflow.keras.models import Sequential
+from nilmtk_contrib.utils.validation import safe_train_test_split as train_test_split
 from tensorflow.keras.callbacks import ModelCheckpoint
-import tensorflow.keras.backend as K
 import tensorflow as tf
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 gpus=tf.config.experimental.list_physical_devices("GPU")
 for gpu in gpus:
     tf.config.experimental.set_memory_growth(gpu,True)
-import random
-random.seed(10)
-np.random.seed(10)
 
 
 class SequenceLengthError(Exception):
@@ -135,6 +129,7 @@ def get_config(self):
 class ResNet(Disaggregator):
 
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "tensorflow"))
 
         self.MODEL_NAME = "ResNet"
         self.chunk_wise_training = params.get('chunk_wise_training',False)
@@ -147,12 +142,12 @@ def __init__(self, params):
         self.load_model_path=params.get('load_model_path',None)
         self.appliance_params = params.get('appliance_params',{})
         if self.sequence_length%2==0:
-            print ("Sequence length should be odd!")
+            _log_print("Sequence length should be odd!")
             raise (SequenceLengthError)
 
     def partial_fit(self,train_main,train_appliances,do_preprocessing=True,**load_kwargs):
 
-        print("...............ResNet partial_fit running...............")
+        _log_print("...............ResNet partial_fit running...............")
         if len(self.appliance_params) == 0:
             self.set_appliance_params(train_appliances)
 
@@ -168,23 +163,23 @@ def partial_fit(self,train_main,train_appliances,do_preprocessing=True,**load_kw
             app_df_values = app_df.values.reshape((-1,self.sequence_length))
             new_train_appliances.append((app_name, app_df_values))
         train_appliances = new_train_appliances
-        print(train_appliances)
+        _log_print(train_appliances)
         for appliance_name, power in train_appliances:
             if appliance_name not in self.models:
-                print("First model training for ", appliance_name)
+                _log_print("First model training for ", appliance_name)
                 self.models[appliance_name] = self.return_network()
             else:
-                print("Started Retraining model for ", appliance_name)
+                _log_print("Started Retraining model for ", appliance_name)
 
             model = self.models[appliance_name]
             if train_main.size > 0:
                 # Sometimes chunks can be empty after dropping NANS
                 if len(train_main) > 10:
                     # Do validation when you have sufficient samples
-                    filepath = 'ResNet-temp-weights-'+str(random.randint(0,100000))+'.h5'
+                    filepath = checkpoint_path(".h5")
                     checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1,save_best_only=True,mode='min')
                     train_x, v_x, train_y, v_y = train_test_split(train_main, power, test_size=.15,random_state=10)
-                    history=model.fit(train_x,train_y,validation_data=(v_x,v_y),epochs=self.n_epochs,callbacks=[checkpoint],batch_size=self.batch_size)
+                    model.fit(train_x,train_y,validation_data=(v_x,v_y),epochs=self.n_epochs,callbacks=[checkpoint],batch_size=self.batch_size)
                     model.load_weights(filepath)
 
 
@@ -216,14 +211,13 @@ def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
                 # the sum_arr keeps the number of times a particular timestamp has occured
                 # the predictions are summed for  agiven time, and is divided by the number of times it has occured
                 
-                l = self.sequence_length
-                n = len(prediction) + l - 1
+                window_length = self.sequence_length
+                n = len(prediction) + window_length - 1
                 sum_arr = np.zeros((n))
                 counts_arr = np.zeros((n))
-                o = len(sum_arr)
                 for i in range(len(prediction)):
-                    sum_arr[i:i + l] += prediction[i].flatten()
-                    counts_arr[i:i + l] += 1
+                    sum_arr[i:i + window_length] += prediction[i].flatten()
+                    counts_arr[i:i + window_length] += 1
                 for i in range(len(sum_arr)):
                     sum_arr[i] = sum_arr[i] / counts_arr[i]
 
@@ -287,10 +282,10 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
                 if app_name in self.appliance_params:
                     app_mean = self.appliance_params[app_name]['mean']
                     app_std = self.appliance_params[app_name]['std']
-                    app_min=self.appliance_params[app_name]['min']
-                    app_max=self.appliance_params[app_name]['max']
+                    self.appliance_params[app_name]['min']
+                    self.appliance_params[app_name]['max']
                 else:
-                    print ("Parameters for ", app_name ," were not found!")
+                    _log_print("Parameters for ", app_name ," were not found!")
                     raise ApplianceNotFoundError()
 
 
@@ -305,7 +300,7 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
                     
                 appliance_list.append((app_name, processed_app_dfs))
                 #new_app_readings = np.array([ new_app_readings[i:i+n] for i in range(len(new_app_readings)-n+1) ])
-                #print (new_mains.shape, new_app_readings.shape, app_name)
+                #_log_print(new_mains.shape, new_app_readings.shape, app_name)
 
             return processed_mains_lst, appliance_list
 
@@ -325,11 +320,11 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
     def set_appliance_params(self,train_appliances):
 
         for (app_name,df_list) in train_appliances:
-            l = np.array(pd.concat(df_list,axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
-            app_max=np.max(l)
-            app_min=np.min(l)
+            values = np.array(pd.concat(df_list,axis=0))
+            app_mean = np.mean(values)
+            app_std = np.std(values)
+            app_max=np.max(values)
+            app_min=np.min(values)
             if app_std<1:
                 app_std = 100
             self.appliance_params.update({app_name:{'mean':app_mean,'std':app_std,'max':app_max,'min':app_min}})
diff --git a/nilmtk_contrib/disaggregate/resnet_classification.py b/nilmtk_contrib/disaggregate/resnet_classification.py
index 0e0dbaf..952b2a1 100644
--- a/nilmtk_contrib/disaggregate/resnet_classification.py
+++ b/nilmtk_contrib/disaggregate/resnet_classification.py
@@ -1,33 +1,30 @@
 from __future__ import print_function, division
-from warnings import warn
 from nilmtk.disaggregate import Disaggregator
-from tensorflow.keras.layers import Layer,Conv1D, Dense, Dropout, Reshape, Flatten, Bidirectional, LSTM, Input, Multiply, Activation, Add
-from tensorflow.keras.layers import Conv2D, ZeroPadding1D,MaxPooling1D
+from tensorflow.keras.layers import Layer,Conv1D, Dense, Dropout, Flatten, Input, Multiply, Activation, Add
+from tensorflow.keras.layers import ZeroPadding1D,MaxPooling1D
 from tensorflow.keras.layers import BatchNormalization
 from tensorflow.keras import Model
-import os
-import pickle
 import pandas as pd
 import numpy as np
 from collections import OrderedDict
 from tensorflow.keras.optimizers import SGD
 from tensorflow.keras.losses import BinaryCrossentropy,MeanSquaredError
-from tensorflow.keras.models import Sequential, load_model
-import matplotlib.pyplot as plt
-import matplotlib as mlp
-from sklearn.model_selection import train_test_split
+from nilmtk_contrib.utils.validation import safe_train_test_split as train_test_split
 from tensorflow.keras.callbacks import ModelCheckpoint
-import tensorflow.keras.backend as K
 import tensorflow as tf
+import copy
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+from nilmtk_contrib.preprocessing.classification import (
+    appliance_threshold,
+    classification_metadata,
+    loss_weight_metadata,
+)
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 gpus=tf.config.experimental.list_physical_devices("GPU")
 for gpu in gpus:
     tf.config.experimental.set_memory_growth(gpu,True)
-import random
-import sys
-random.seed(10)
-np.random.seed(10)
-
-import copy
 
 class SequenceLengthError(Exception):
     pass
@@ -140,6 +137,7 @@ def get_config(self):
 class ResNet_classification(Disaggregator):
 
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "tensorflow"))
 
         self.MODEL_NAME = "ResNet_classification"
         self.chunk_wise_training = params.get('chunk_wise_training',False)
@@ -151,13 +149,24 @@ def __init__(self, params):
         self.batch_size = params.get('batch_size',512)
         self.appliance_params = params.get('appliance_params',{})
         self.mains_params=params.get('mains_params',{})
+        self.classification_threshold = params.get('classification_threshold', params.get('on_power_threshold', 15))
+        self.regression_loss_weight = params.get('regression_loss_weight', 1.0)
+        self.classification_loss_weight = params.get('classification_loss_weight', 1.0)
+        self.classification_metadata = classification_metadata(
+            self.appliance_params,
+            self.classification_threshold,
+        )
+        self.loss_weight_metadata = loss_weight_metadata(
+            self.regression_loss_weight,
+            self.classification_loss_weight,
+        )
         if self.sequence_length%2==0:
-            print ("Sequence length should be odd!")
+            _log_print("Sequence length should be odd!")
             raise (SequenceLengthError)
 
     def partial_fit(self,train_main,train_appliances,do_preprocessing=True,**load_kwargs):
 
-        print("...............ResNet_classification partial_fit running...............")
+        _log_print("...............ResNet_classification partial_fit running...............")
         if len(self.appliance_params) == 0:
             self.set_appliance_params(train_appliances)
 
@@ -190,17 +199,17 @@ def partial_fit(self,train_main,train_appliances,do_preprocessing=True,**load_kw
         
         for appliance_name, power in train_appliances:
             if appliance_name not in self.models:
-                print("First model training for ", appliance_name)
+                _log_print("First model training for ", appliance_name)
                 self.models[appliance_name] = self.return_network()
             else:
-                print("Started Retraining model for ", appliance_name)
+                _log_print("Started Retraining model for ", appliance_name)
 
             model = self.models[appliance_name]
             if train_main.size > 0:
                 # Sometimes chunks can be empty after dropping NANS
                 if len(train_main) > 10:
                     # Do validation when you have sufficient samples
-                    filepath = 'ResNet_classification-temp-weights-'+str(random.randint(0,100000))+'.h5'
+                    filepath = checkpoint_path(".h5")
                     checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1,save_best_only=True,mode='min')
 
                     power=pd.DataFrame(power)
@@ -214,7 +223,7 @@ def partial_fit(self,train_main,train_appliances,do_preprocessing=True,**load_kw
                     appliance_train_classification=train_class_y[:,self.sequence_length:]
                     appliance_val_classification=v_class_y[:,self.sequence_length:]
                    
-                    history=model.fit(train_x,[train_y,appliance_train_classification],validation_data=(v_x,[v_y,appliance_val_classification]),epochs=self.n_epochs,callbacks=[checkpoint],batch_size=self.batch_size)
+                    model.fit(train_x,[train_y,appliance_train_classification],validation_data=(v_x,[v_y,appliance_val_classification]),epochs=self.n_epochs,callbacks=[checkpoint],batch_size=self.batch_size)
                     model.load_weights(filepath)
 
     def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
@@ -243,27 +252,25 @@ def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
                 # the sum_arr keeps the number of times a particular timestamp has occured
                 # the predictions are summed for  agiven time, and is divided by the number of times it has occured
                 
-                l = self.sequence_length
-                n = len(prediction_output) + l - 1
+                window_length = self.sequence_length
+                n = len(prediction_output) + window_length - 1
                 sum_arr = np.zeros((n))
                 counts_arr = np.zeros((n))
-                o = len(sum_arr)
                 for i in range(len(prediction_output)):
-                    sum_arr[i:i + l] += prediction_output[i].flatten()
-                    counts_arr[i:i + l] += 1
+                    sum_arr[i:i + window_length] += prediction_output[i].flatten()
+                    counts_arr[i:i + window_length] += 1
                 for i in range(len(sum_arr)):
                     sum_arr[i] = sum_arr[i] / counts_arr[i]
 
                 prediction = (self.appliance_params[appliance]['min'] + (sum_arr * (self.appliance_params[appliance]['max']-self.appliance_params[appliance]['min'])))
 
-                l = self.sequence_length
-                n = len(prediction_classification) + l - 1
+                window_length = self.sequence_length
+                n = len(prediction_classification) + window_length - 1
                 sum_arr = np.zeros((n))
                 counts_arr = np.zeros((n))
-                o = len(sum_arr)
                 for i in range(len(prediction_classification)):
-                    sum_arr[i:i + l] += prediction_classification[i].flatten()
-                    counts_arr[i:i + l] += 1
+                    sum_arr[i:i + window_length] += prediction_classification[i].flatten()
+                    counts_arr[i:i + window_length] += 1
                 for i in range(len(sum_arr)):
                     sum_arr[i] = sum_arr[i] / counts_arr[i]
                     
@@ -286,9 +295,6 @@ def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
 
     def return_network(self):
 
-        filters = 32
-        kernel_size = 4
-        units = 128
         input_data = Input(shape=(self.sequence_length, 1))
 
         #This classificcation network is inspired from:-
@@ -330,20 +336,30 @@ def return_network(self):
         optimizer = SGD(learning_rate=0.01, momentum=0.9)
         full_model.summary()
         #Two outputs of the model the classification output and the final output
-        full_model.compile(optimizer=optimizer, loss={"output": MeanSquaredError(),"classification_output": BinaryCrossentropy()})
+        full_model.compile(
+            optimizer=optimizer,
+            loss={"output": MeanSquaredError(),"classification_output": BinaryCrossentropy()},
+            loss_weights={
+                "output": self.regression_loss_weight,
+                "classification_output": self.classification_loss_weight,
+            },
+        )
         return full_model
 
     def classify(self,classify_appliance):
         appliance_on_off = []
-        #Threshold for on-off
-        THRESHOLD=15
         for app_index, (appliance_name, on_off_list) in enumerate(classify_appliance):
+            threshold = appliance_threshold(
+                self.appliance_params,
+                appliance_name,
+                self.classification_threshold,
+            )
             classification_appliance_dfs = []
             for appliance in on_off_list:
                 n = self.sequence_length
                 units_to_pad = n // 2
-                appliance[appliance <= THRESHOLD] = 0
-                appliance[appliance > THRESHOLD] = 1
+                appliance[appliance <= threshold] = 0
+                appliance[appliance > threshold] = 1
                 new_app_readings = appliance.values.flatten()
                 new_app_readings = np.pad(new_app_readings, (units_to_pad,units_to_pad),'constant',constant_values = (0,0))
                 new_app_readings = np.array([new_app_readings[i:i + n] for i in range(len(new_app_readings) - n + 1)]) 
@@ -369,12 +385,12 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
             for app_index, (app_name, app_df_lst) in enumerate(submeters_lst):
 
                 if app_name in self.appliance_params:
-                    app_mean = self.appliance_params[app_name]['mean']
-                    app_std = self.appliance_params[app_name]['std']
+                    self.appliance_params[app_name]['mean']
+                    self.appliance_params[app_name]['std']
                     app_min=self.appliance_params[app_name]['min']
                     app_max=self.appliance_params[app_name]['max']
                 else:
-                    print ("Parameters for ", app_name ," were not found!")
+                    _log_print("Parameters for ", app_name ," were not found!")
                     raise ApplianceNotFoundError()
 
 
@@ -406,15 +422,15 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
             return processed_mains_lst
 
     def set_mains_params(self,train_main):
-        l=[]
+        values=[]
         for mains in train_main :
             new_mains = mains.values.flatten()
-            l.extend(new_mains)
+            values.extend(new_mains)
        
-        main_mean=np.mean(l)
-        main_std=np.std(l)
-        main_min=np.min(l)
-        main_max=np.max(l)
+        main_mean=np.mean(values)
+        main_std=np.std(values)
+        main_min=np.min(values)
+        main_max=np.max(values)
         self.mains_params.update({'mean':main_mean,'std':main_std,'min':main_min,'max':main_max})
 
 
@@ -422,11 +438,11 @@ def set_mains_params(self,train_main):
     def set_appliance_params(self,train_appliances):
 
         for (app_name,df_list) in train_appliances:
-            l = np.array(pd.concat(df_list,axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
-            app_max=np.max(l)
-            app_min=np.min(l)
+            values = np.array(pd.concat(df_list,axis=0))
+            app_mean = np.mean(values)
+            app_std = np.std(values)
+            app_max=np.max(values)
+            app_min=np.min(values)
             if app_std<1:
                 app_std = 100
             self.appliance_params.update({app_name:{'mean':app_mean,'std':app_std,'min':app_min,'max':app_max}})
diff --git a/nilmtk_contrib/disaggregate/rnn.py b/nilmtk_contrib/disaggregate/rnn.py
index 1b632d4..388aa3a 100644
--- a/nilmtk_contrib/disaggregate/rnn.py
+++ b/nilmtk_contrib/disaggregate/rnn.py
@@ -7,6 +7,11 @@
 from tensorflow.keras.models import Sequential
 
 
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+from nilmtk_contrib.utils.validation import train_validation_split
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 class SequenceLengthError(Exception):
     pass
 
@@ -16,6 +21,7 @@ class ApplianceNotFoundError(Exception):
 class RNN(Disaggregator):
 
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "tensorflow"))
         """
         Parameters to be specified for the model
         """
@@ -31,7 +37,7 @@ def __init__(self, params):
         self.mains_mean = params.get('mains_mean',1800)
         self.mains_std = params.get('mains_std',600)
         if self.sequence_length%2==0:
-            print ("Sequence length should be odd!")
+            _log_print("Sequence length should be odd!")
             raise (SequenceLengthError)
 
 
@@ -40,7 +46,7 @@ def partial_fit(self, train_main, train_appliances, do_preprocessing=True, curre
         if len(self.appliance_params) == 0:
             self.set_appliance_params(train_appliances)
 
-        print("...............RNN partial_fit running...............")
+        _log_print("...............RNN partial_fit running...............")
         # Do the pre-processing, such as  windowing and normalizing
         if do_preprocessing:
             train_main, train_appliances = self.call_preprocessing(
@@ -58,30 +64,32 @@ def partial_fit(self, train_main, train_appliances, do_preprocessing=True, curre
         for appliance_name, power in train_appliances:
             # Check if the appliance was already trained. If not then create a new model for it
             if appliance_name not in self.models:
-                print("First model training for ", appliance_name)
+                _log_print("First model training for ", appliance_name)
                 self.models[appliance_name] = self.return_network()
             # Retrain the particular appliance
             else:
-                print("Started Retraining model for ", appliance_name)
+                _log_print("Started Retraining model for ", appliance_name)
 
             model = self.models[appliance_name]
             if train_main.size > 0:
                 # Sometimes chunks can be empty after dropping NANS
                 if len(train_main) > 10:
                     # Do validation when you have sufficient samples
-                    filepath = self.file_prefix + "-{}-epoch{}.h5".format(
-                            "_".join(appliance_name.split()),
-                            current_epoch,
-                    )
-                    checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1,save_best_only=True,mode='min')
+                    filepath = checkpoint_path(".h5")
+                    checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1 if self.verbose else 0,save_best_only=True,mode='min')
+                    split = train_validation_split(train_main, power, validation_fraction=0.15, strategy='tail', allow_no_validation=True)
+                    if not split.metadata.should_train:
+                        continue
                     model.fit(
-                            train_main, power,
-                            validation_split=.15,
+                            split.X_train, split.y_train,
+                            validation_data=(split.X_val, split.y_val) if split.metadata.validation_enabled else None,
                             epochs=self.n_epochs,
                             batch_size=self.batch_size,
-                            callbacks=[ checkpoint ],
+                            callbacks=[checkpoint] if split.metadata.validation_enabled else [],
+                            verbose=1 if self.verbose else 0,
                     )
-                    model.load_weights(filepath)
+                    if split.metadata.validation_enabled and filepath.exists():
+                        model.load_weights(filepath)
 
     def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
 
@@ -149,7 +157,7 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
                     app_mean = self.appliance_params[app_name]['mean']
                     app_std = self.appliance_params[app_name]['std']
                 else:
-                    print ("Parameters for ", app_name ," were not found!")
+                    _log_print("Parameters for ", app_name ," were not found!")
                     raise ApplianceNotFoundError()
 
                 processed_appliance_dfs = []
@@ -179,10 +187,10 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
     def set_appliance_params(self,train_appliances):
         # Find the parameters using the first
         for (app_name,df_list) in train_appliances:
-            l = np.array(pd.concat(df_list,axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
+            values = np.array(pd.concat(df_list,axis=0))
+            app_mean = np.mean(values)
+            app_std = np.std(values)
             if app_std<1:
                 app_std = 100
             self.appliance_params.update({app_name:{'mean':app_mean,'std':app_std}})
-        print (self.appliance_params)
+        _log_print(self.appliance_params)
diff --git a/nilmtk_contrib/disaggregate/rnn_attention.py b/nilmtk_contrib/disaggregate/rnn_attention.py
index ae1dc7b..0f80570 100644
--- a/nilmtk_contrib/disaggregate/rnn_attention.py
+++ b/nilmtk_contrib/disaggregate/rnn_attention.py
@@ -1,25 +1,19 @@
 from __future__ import print_function, division
-from warnings import warn
 from nilmtk.disaggregate import Disaggregator
-from tensorflow.keras.layers import Conv1D, Dense, Dropout, Reshape, Flatten, Bidirectional, LSTM
+from tensorflow.keras.layers import Conv1D, Dense, Bidirectional, LSTM
 from tensorflow.keras.layers import Layer
-import os
-import pickle
 import pandas as pd
 import numpy as np
 from collections import OrderedDict
-from tensorflow.keras.optimizers import SGD
-from tensorflow.keras.models import Sequential, load_model
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split
+from tensorflow.keras.models import Sequential
+from nilmtk_contrib.utils.validation import safe_train_test_split as train_test_split
 from tensorflow.keras.callbacks import ModelCheckpoint
 import tensorflow.keras.backend as K
 import tensorflow as tf
-import random
-import sys
-random.seed(10)
-np.random.seed(10)
-import tensorflow as tf
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 gpus=tf.config.experimental.list_physical_devices("GPU")
 for gpu in gpus:
     tf.config.experimental.set_memory_growth(gpu,True)
@@ -60,6 +54,7 @@ def get_config(self):
 class RNN_attention(Disaggregator):
 
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "tensorflow"))
         """
         Parameters to be specified for the model
         """
@@ -75,7 +70,7 @@ def __init__(self, params):
         self.mains_mean = params.get('mains_mean',1800)
         self.mains_std = params.get('mains_std',600)
         if self.sequence_length%2==0:
-            print ("Sequence length should be odd!")
+            _log_print("Sequence length should be odd!")
             raise (SequenceLengthError)
 
     def partial_fit(self,train_main,train_appliances,do_preprocessing=True,
@@ -85,7 +80,7 @@ def partial_fit(self,train_main,train_appliances,do_preprocessing=True,
         if len(self.appliance_params) == 0:
             self.set_appliance_params(train_appliances)  
 
-        print("...............RNN_attention partial_fit running...............")
+        _log_print("...............RNN_attention partial_fit running...............")
         # Do the pre-processing, such as  windowing and normalizing
 
         if do_preprocessing:
@@ -105,18 +100,18 @@ def partial_fit(self,train_main,train_appliances,do_preprocessing=True,
         for appliance_name, power in train_appliances:
             # Check if the appliance was already trained. If not then create a new model for it
             if appliance_name not in self.models:
-                print("First model training for ", appliance_name)
+                _log_print("First model training for ", appliance_name)
                 self.models[appliance_name] = self.return_network()
             # Retrain the particular appliance
             else:
-                print("Started Retraining model for ", appliance_name)
+                _log_print("Started Retraining model for ", appliance_name)
 
             model = self.models[appliance_name]
             if train_main.size > 0:
                 # Sometimes chunks can be empty after dropping NANS
                 if len(train_main) > 10:
                     # Do validation when you have sufficient samples
-                    filepath = 'RNN_attention-temp-weights-'+str(random.randint(0,100000))+'.h5'
+                    filepath = checkpoint_path(".h5")
                     checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1,save_best_only=True,mode='min')
                     train_x, v_x, train_y, v_y = train_test_split(train_main, power, test_size=.15,random_state=10)
                     model.fit(train_x,train_y,validation_data=(v_x,v_y),epochs=self.n_epochs,callbacks=[checkpoint],batch_size=self.batch_size)
@@ -189,7 +184,7 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
                     app_mean = self.appliance_params[app_name]['mean']
                     app_std = self.appliance_params[app_name]['std']
                 else:
-                    print ("Parameters for ", app_name ," were not found!")
+                    _log_print("Parameters for ", app_name ," were not found!")
                     raise ApplianceNotFoundError()
 
                 processed_appliance_dfs = []
@@ -219,11 +214,10 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
     def set_appliance_params(self,train_appliances):
         # Find the parameters using the first
         for (app_name,df_list) in train_appliances:
-            l = np.array(pd.concat(df_list,axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
+            values = np.array(pd.concat(df_list,axis=0))
+            app_mean = np.mean(values)
+            app_std = np.std(values)
             if app_std<1:
                 app_std = 100
             self.appliance_params.update({app_name:{'mean':app_mean,'std':app_std}})
-        print (self.appliance_params)
- 
\ No newline at end of file
+        _log_print(self.appliance_params)
diff --git a/nilmtk_contrib/disaggregate/rnn_attention_classification.py b/nilmtk_contrib/disaggregate/rnn_attention_classification.py
index a3e6b86..37c621f 100644
--- a/nilmtk_contrib/disaggregate/rnn_attention_classification.py
+++ b/nilmtk_contrib/disaggregate/rnn_attention_classification.py
@@ -1,34 +1,29 @@
 from __future__ import print_function, division
-from warnings import warn
 from nilmtk.disaggregate import Disaggregator
-from tensorflow.keras.layers import Conv1D, Dense, Dropout, Reshape, Flatten, Bidirectional, LSTM, Input, Multiply, Activation, Add
-from tensorflow.keras.layers import Conv2D, ZeroPadding1D,MaxPooling1D
-from tensorflow.keras.layers import BatchNormalization
+from tensorflow.keras.layers import Conv1D, Dense, Flatten, Bidirectional, LSTM, Input, Multiply, Activation, Add
 from tensorflow.keras.layers import Layer
 from tensorflow.keras import Model
-import os
-import pickle
 import pandas as pd
 import numpy as np
 from collections import OrderedDict
 from tensorflow.keras.optimizers import SGD
 from tensorflow.keras.losses import BinaryCrossentropy,MeanSquaredError
-from tensorflow.keras.models import Sequential, load_model
-import matplotlib.pyplot as plt
-import matplotlib as mlp
-from sklearn.model_selection import train_test_split
+from nilmtk_contrib.utils.validation import safe_train_test_split as train_test_split
 from tensorflow.keras.callbacks import ModelCheckpoint
-import tensorflow.keras.backend as K
 import tensorflow as tf
+import copy
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+from nilmtk_contrib.preprocessing.classification import (
+    appliance_threshold,
+    classification_metadata,
+    loss_weight_metadata,
+)
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 gpus=tf.config.experimental.list_physical_devices("GPU")
 for gpu in gpus:
     tf.config.experimental.set_memory_growth(gpu,True)
-import random
-import sys
-random.seed(10)
-np.random.seed(10)
-
-import copy
 
 class SequenceLengthError(Exception):
     pass
@@ -161,6 +156,7 @@ def get_config(self):
 class RNN_attention_classification(Disaggregator):
 
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "tensorflow"))
 
         #self.MODEL_NAME = "RNNattention"
         self.MODEL_NAME = "RNN_attention_classification"
@@ -173,13 +169,24 @@ def __init__(self, params):
         self.batch_size = params.get('batch_size',512)
         self.appliance_params = params.get('appliance_params',{})
         self.mains_params=params.get('mains_params',{})
+        self.classification_threshold = params.get('classification_threshold', params.get('on_power_threshold', 15))
+        self.regression_loss_weight = params.get('regression_loss_weight', 1.0)
+        self.classification_loss_weight = params.get('classification_loss_weight', 1.0)
+        self.classification_metadata = classification_metadata(
+            self.appliance_params,
+            self.classification_threshold,
+        )
+        self.loss_weight_metadata = loss_weight_metadata(
+            self.regression_loss_weight,
+            self.classification_loss_weight,
+        )
         if self.sequence_length%2==0:
-            print ("Sequence length should be odd!")
+            _log_print("Sequence length should be odd!")
             raise (SequenceLengthError)
 
     def partial_fit(self,train_main,train_appliances,do_preprocessing=True,**load_kwargs):
 
-        print("...............RNN_attention_classification partial_fit running...............")
+        _log_print("...............RNN_attention_classification partial_fit running...............")
         if len(self.appliance_params) == 0:
             self.set_appliance_params(train_appliances)
         self.set_mains_params(train_main)  
@@ -209,17 +216,17 @@ def partial_fit(self,train_main,train_appliances,do_preprocessing=True,**load_kw
         self.att_models={}
         for appliance_name, power in train_appliances:
             if appliance_name not in self.models:
-                print("First model training for ", appliance_name)
+                _log_print("First model training for ", appliance_name)
                 self.models[appliance_name],self.att_models[appliance_name] = self.return_network()
             else:
-                print("Started Retraining model for ", appliance_name)
+                _log_print("Started Retraining model for ", appliance_name)
 
             model = self.models[appliance_name]
             if train_main.size > 0:
                 # Sometimes chunks can be empty after dropping NANS
                 if len(train_main) > 10:
                     # Do validation when you have sufficient samples
-                    filepath = 'RNN_attention_classification-temp-weights-'+str(random.randint(0,100000))+'.h5'
+                    filepath = checkpoint_path(".h5")
                     checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1,save_best_only=True,mode='min')
 
                     power=pd.DataFrame(power)
@@ -232,7 +239,7 @@ def partial_fit(self,train_main,train_appliances,do_preprocessing=True,**load_kw
                     v_y=v_class_y[:,:self.sequence_length]
                     appliance_train_classification=train_class_y[:,self.sequence_length:]
                     appliance_val_classification=v_class_y[:,self.sequence_length:]
-                    history=model.fit(train_x,[train_y,appliance_train_classification],validation_data=(v_x,[v_y,appliance_val_classification]),epochs=self.n_epochs,callbacks=[checkpoint],batch_size=self.batch_size)
+                    model.fit(train_x,[train_y,appliance_train_classification],validation_data=(v_x,[v_y,appliance_val_classification]),epochs=self.n_epochs,callbacks=[checkpoint],batch_size=self.batch_size)
                     model.load_weights(filepath)
 
     def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
@@ -255,34 +262,34 @@ def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
                 prediction = []
                 model = self.models[appliance]
                 prediction_output,prediction_classification = self.models[appliance].predict(x=test_main_array,batch_size=self.batch_size)
-                W=self.att_models[appliance].predict(x=test_main_array,batch_size=self.batch_size)
+                # The attention weights were never consumed, so the attention model is not evaluated here.
                 #####################
                 # This block is for creating the average of predictions over the different sequences
                 # the counts_arr keeps the number of times a particular timestamp has occured
                 # the sum_arr keeps the number of times a particular timestamp has occured
                 # the predictions are summed for  agiven time, and is divided by the number of times it has occured
                 
-                l = self.sequence_length
-                n = len(prediction_output) + l - 1
+                window_length = self.sequence_length
+                n = len(prediction_output) + window_length - 1
                 sum_arr = np.zeros((n))
                 counts_arr = np.zeros((n))
-                o = len(sum_arr)
+                # Accumulate each window of prediction_output over the timestamps it covers.
                 for i in range(len(prediction_output)):
-                    sum_arr[i:i + l] += prediction_output[i].flatten()
-                    counts_arr[i:i + l] += 1
+                    sum_arr[i:i + window_length] += prediction_output[i].flatten()
+                    counts_arr[i:i + window_length] += 1
                 for i in range(len(sum_arr)):
                     sum_arr[i] = sum_arr[i] / counts_arr[i]
 
                 prediction = (self.appliance_params[appliance]['min'] + (sum_arr * (self.appliance_params[appliance]['max']-self.appliance_params[appliance]['min'])))
 
-                l = self.sequence_length
-                n = len(prediction_classification) + l - 1
+                window_length = self.sequence_length
+                n = len(prediction_classification) + window_length - 1
                 sum_arr = np.zeros((n))
                 counts_arr = np.zeros((n))
-                o = len(sum_arr)
+                # Accumulate each window of prediction_classification over the timestamps it covers.
                 for i in range(len(prediction_classification)):
-                    sum_arr[i:i + l] += prediction_classification[i].flatten()
-                    counts_arr[i:i + l] += 1
+                    sum_arr[i:i + window_length] += prediction_classification[i].flatten()
+                    counts_arr[i:i + window_length] += 1
                 for i in range(len(sum_arr)):
                     sum_arr[i] = sum_arr[i] / counts_arr[i]
                     
@@ -300,8 +307,6 @@ def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
 
     def return_network(self):
 
-        filters = 32
-        kernel_size = 4
         units = 128
         input_data = Input(shape=(self.sequence_length, 1))
         #This classificcation network is inspired from:-
@@ -341,22 +346,32 @@ def return_network(self):
         optimizer = SGD(learning_rate=0.01, momentum=0.9)
         full_model.summary()
         #Two outputs of the model the classification output and the final output
-        full_model.compile(optimizer=optimizer, loss={"output": MeanSquaredError(),"classification_output": BinaryCrossentropy()})
+        full_model.compile(
+            optimizer=optimizer,
+            loss={"output": MeanSquaredError(),"classification_output": BinaryCrossentropy()},
+            loss_weights={
+                "output": self.regression_loss_weight,
+                "classification_output": self.classification_loss_weight,
+            },
+        )
         return full_model,attention_model
 
 
     def classify(self,classify_appliance):
         appliance_on_off = []
-        #Threshold for on-off
-        THRESHOLD=15
 
         for app_index, (appliance_name, on_off_list) in enumerate(classify_appliance):
+            threshold = appliance_threshold(
+                self.appliance_params,
+                appliance_name,
+                self.classification_threshold,
+            )
             classification_appliance_dfs = []
             for appliance in on_off_list:
                 n = self.sequence_length
                 units_to_pad = n // 2
-                appliance[appliance <= THRESHOLD] = 0
-                appliance[appliance > THRESHOLD] = 1
+                appliance[appliance <= threshold] = 0
+                appliance[appliance > threshold] = 1
                 new_app_readings = appliance.values.flatten()
                 new_app_readings = np.pad(new_app_readings, (units_to_pad,units_to_pad),'constant',constant_values = (0,0))
                 new_app_readings = np.array([new_app_readings[i:i + n] for i in range(len(new_app_readings) - n + 1)]) 
@@ -384,12 +399,12 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
             for app_index, (app_name, app_df_lst) in enumerate(submeters_lst):
 
                 if app_name in self.appliance_params:
-                    app_mean = self.appliance_params[app_name]['mean']
-                    app_std = self.appliance_params[app_name]['std']
+                    # The 'mean'/'std' lookups were unused and have been dropped;
+                    # only the 'min' and 'max' statistics are read here.
                     app_min=self.appliance_params[app_name]['min']
                     app_max=self.appliance_params[app_name]['max']
                 else:
-                    print ("Parameters for ", app_name ," were not found!")
+                    _log_print("Parameters for ", app_name ," were not found!")
                     raise ApplianceNotFoundError()
 
 
@@ -405,7 +420,7 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
                     
                 appliance_list.append((app_name, processed_app_dfs))
                 #new_app_readings = np.array([ new_app_readings[i:i+n] for i in range(len(new_app_readings)-n+1) ])
-                #print (new_mains.shape, new_app_readings.shape, app_name)
+                #_log_print(new_mains.shape, new_app_readings.shape, app_name)
 
             return processed_mains_lst, appliance_list
 
@@ -423,15 +438,15 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
             return processed_mains_lst
 
     def set_mains_params(self,train_main):
-        l=[]
+        values=[]
         for mains in train_main :
             new_mains = mains.values.flatten()
-            l.extend(new_mains)
+            values.extend(new_mains)
        
-        main_mean=np.mean(l)
-        main_std=np.std(l)
-        main_min=np.min(l)
-        main_max=np.max(l)
+        main_mean=np.mean(values)
+        main_std=np.std(values)
+        main_min=np.min(values)
+        main_max=np.max(values)
         self.mains_params.update({'mean':main_mean,'std':main_std,'min':main_min,'max':main_max})
 
 
@@ -439,11 +454,11 @@ def set_mains_params(self,train_main):
     def set_appliance_params(self,train_appliances):
 
         for (app_name,df_list) in train_appliances:
-            l = np.array(pd.concat(df_list,axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
-            app_max=np.max(l)
-            app_min=np.min(l)
+            values = np.array(pd.concat(df_list,axis=0))
+            app_mean = np.mean(values)
+            app_std = np.std(values)
+            app_max=np.max(values)
+            app_min=np.min(values)
             if app_std<1:
                 app_std = 100
             self.appliance_params.update({app_name:{'mean':app_mean,'std':app_std,'min':app_min,'max':app_max}})
diff --git a/nilmtk_contrib/disaggregate/seq2point.py b/nilmtk_contrib/disaggregate/seq2point.py
index 2ba2cdd..2139902 100644
--- a/nilmtk_contrib/disaggregate/seq2point.py
+++ b/nilmtk_contrib/disaggregate/seq2point.py
@@ -3,10 +3,15 @@
 import pandas as pd
 from nilmtk.disaggregate import Disaggregator
 from tensorflow.keras.callbacks import ModelCheckpoint
-from tensorflow.keras.layers import Conv1D, Dense, Dropout, Reshape, Flatten
+from tensorflow.keras.layers import Conv1D, Dense, Dropout, Flatten
 from tensorflow.keras.models import Sequential
 
 
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+from nilmtk_contrib.utils.validation import train_validation_split
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 class SequenceLengthError(Exception):
     pass
 
@@ -16,6 +21,7 @@ class ApplianceNotFoundError(Exception):
 class Seq2Point(Disaggregator):
 
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "tensorflow"))
         """
         Parameters to be specified for the model
         """
@@ -31,7 +37,7 @@ def __init__(self, params):
         self.mains_mean = params.get('mains_mean',1800)
         self.mains_std = params.get('mains_std',600)
         if self.sequence_length%2==0:
-            print ("Sequence length should be odd!")
+            _log_print("Sequence length should be odd!")
             raise (SequenceLengthError)
 
     def partial_fit(self, train_main, train_appliances, do_preprocessing=True, current_epoch=0, **load_kwargs):
@@ -39,7 +45,7 @@ def partial_fit(self, train_main, train_appliances, do_preprocessing=True, curre
         if len(self.appliance_params) == 0:
             self.set_appliance_params(train_appliances)
 
-        print("...............Seq2Point partial_fit running...............")
+        _log_print("...............Seq2Point partial_fit running...............")
         # Do the pre-processing, such as  windowing and normalizing
         if do_preprocessing:
             train_main, train_appliances = self.call_preprocessing(
@@ -57,30 +63,32 @@ def partial_fit(self, train_main, train_appliances, do_preprocessing=True, curre
         for appliance_name, power in train_appliances:
             # Check if the appliance was already trained. If not then create a new model for it
             if appliance_name not in self.models:
-                print("First model training for", appliance_name)
+                _log_print("First model training for", appliance_name)
                 self.models[appliance_name] = self.return_network()
             # Retrain the particular appliance
             else:
-                print("Started Retraining model for", appliance_name)
+                _log_print("Started Retraining model for", appliance_name)
 
             model = self.models[appliance_name]
             if train_main.size > 0:
                 # Sometimes chunks can be empty after dropping NANS
                 if len(train_main) > 10:
                     # Do validation when you have sufficient samples
-                    filepath = self.file_prefix + "-{}-epoch{}.h5".format(
-                            "_".join(appliance_name.split()),
-                            current_epoch,
-                    )
-                    checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1,save_best_only=True,mode='min')
+                    filepath = checkpoint_path(".h5")
+                    checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1 if self.verbose else 0,save_best_only=True,mode='min')
+                    split = train_validation_split(train_main, power, validation_fraction=0.15, strategy='tail', allow_no_validation=True)
+                    if not split.metadata.should_train:
+                        continue
                     model.fit(
-                            train_main, power,
-                            validation_split=0.15,
+                            split.X_train, split.y_train,
+                            validation_data=(split.X_val, split.y_val) if split.metadata.validation_enabled else None,
                             epochs=self.n_epochs,
                             batch_size=self.batch_size,
-                            callbacks=[checkpoint],
+                            callbacks=[checkpoint] if split.metadata.validation_enabled else [],
+                            verbose=1 if self.verbose else 0,
                     )
-                    model.load_weights(filepath)
+                    if split.metadata.validation_enabled and filepath.exists():
+                        model.load_weights(filepath)
 
                     
     def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
@@ -145,7 +153,7 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
                     app_mean = self.appliance_params[app_name]['mean']
                     app_std = self.appliance_params[app_name]['std']
                 else:
-                    print ("Parameters for ", app_name ," were not found!")
+                    _log_print("Parameters for ", app_name ," were not found!")
                     raise ApplianceNotFoundError()
 
                 processed_appliance_dfs = []
@@ -176,10 +184,10 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
     def set_appliance_params(self,train_appliances):
         # Find the parameters using the first
         for (app_name,df_list) in train_appliances:
-            l = np.array(pd.concat(df_list,axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
+            values = np.array(pd.concat(df_list,axis=0))
+            app_mean = np.mean(values)
+            app_std = np.std(values)
             if app_std<1:
                 app_std = 100
             self.appliance_params.update({app_name:{'mean':app_mean,'std':app_std}})
-        print (self.appliance_params)
+        _log_print(self.appliance_params)
diff --git a/nilmtk_contrib/disaggregate/seq2seq.py b/nilmtk_contrib/disaggregate/seq2seq.py
index c1245b0..465d02d 100644
--- a/nilmtk_contrib/disaggregate/seq2seq.py
+++ b/nilmtk_contrib/disaggregate/seq2seq.py
@@ -7,6 +7,11 @@
 from tensorflow.keras.models import Sequential
 
 
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+from nilmtk_contrib.utils.validation import train_validation_split
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 class SequenceLengthError(Exception):
     pass
 
@@ -18,6 +23,7 @@ class ApplianceNotFoundError(Exception):
 class Seq2Seq(Disaggregator):
 
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "tensorflow"))
 
         self.MODEL_NAME = "Seq2Seq"
         self.file_prefix = "{}-temp-weights".format(self.MODEL_NAME.lower())
@@ -30,11 +36,11 @@ def __init__(self, params):
         self.batch_size = params.get('batch_size',512)
         self.appliance_params = params.get('appliance_params',{})
         if self.sequence_length%2==0:
-            print ("Sequence length should be odd!")
+            _log_print("Sequence length should be odd!")
             raise (SequenceLengthError)
 
     def partial_fit(self, train_main, train_appliances, do_preprocessing=True, current_epoch=0, **load_kwargs):
-        print("...............Seq2Seq partial_fit running...............")
+        _log_print("...............Seq2Seq partial_fit running...............")
         if len(self.appliance_params) == 0:
             self.set_appliance_params(train_appliances)
 
@@ -53,29 +59,31 @@ def partial_fit(self, train_main, train_appliances, do_preprocessing=True, curre
         train_appliances = new_train_appliances
         for appliance_name, power in train_appliances:
             if appliance_name not in self.models:
-                print("First model training for ", appliance_name)
+                _log_print("First model training for ", appliance_name)
                 self.models[appliance_name] = self.return_network()
             else:
-                print("Started Retraining model for ", appliance_name)
+                _log_print("Started Retraining model for ", appliance_name)
 
             model = self.models[appliance_name]
             if train_main.size > 0:
                 # Sometimes chunks can be empty after dropping NANS
                 if len(train_main) > 10:
                     # Do validation when you have sufficient samples
-                    filepath = self.file_prefix + "-{}-epoch{}.h5".format(
-                            "_".join(appliance_name.split()),
-                            current_epoch,
-                    )
-                    checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1,save_best_only=True,mode='min')
+                    filepath = checkpoint_path(".h5")
+                    checkpoint = ModelCheckpoint(filepath,monitor='val_loss',verbose=1 if self.verbose else 0,save_best_only=True,mode='min')
+                    split = train_validation_split(train_main, power, validation_fraction=0.15, strategy='tail', allow_no_validation=True)
+                    if not split.metadata.should_train:
+                        continue
                     model.fit(
-                            train_main, power,
-                            validation_split=.15,
+                            split.X_train, split.y_train,
+                            validation_data=(split.X_val, split.y_val) if split.metadata.validation_enabled else None,
                             epochs=self.n_epochs,
                             batch_size=self.batch_size,
-                            callbacks=[ checkpoint ],
+                            callbacks=[checkpoint] if split.metadata.validation_enabled else [],
+                            verbose=1 if self.verbose else 0,
                     )
-                    model.load_weights(filepath)
+                    if split.metadata.validation_enabled and filepath.exists():
+                        model.load_weights(filepath)
 
                     
     def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
@@ -104,14 +112,14 @@ def disaggregate_chunk(self,test_main_list,model=None,do_preprocessing=True):
                 # the sum_arr keeps the number of times a particular timestamp has occured
                 # the predictions are summed for  agiven time, and is divided by the number of times it has occured
                 
-                l = self.sequence_length
-                n = len(prediction) + l - 1
+                window_length = self.sequence_length
+                n = len(prediction) + window_length - 1
                 sum_arr = np.zeros((n))
                 counts_arr = np.zeros((n))
-                o = len(sum_arr)
+                # Accumulate each predicted window over the timestamps it covers.
                 for i in range(len(prediction)):
-                    sum_arr[i:i + l] += prediction[i].flatten()
-                    counts_arr[i:i + l] += 1
+                    sum_arr[i:i + window_length] += prediction[i].flatten()
+                    counts_arr[i:i + window_length] += 1
                 for i in range(len(sum_arr)):
                     sum_arr[i] = sum_arr[i] / counts_arr[i]
 
@@ -165,7 +173,7 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
                     app_mean = self.appliance_params[app_name]['mean']
                     app_std = self.appliance_params[app_name]['std']
                 else:
-                    print ("Parameters for ", app_name ," were not found!")
+                    _log_print("Parameters for ", app_name ," were not found!")
                     raise ApplianceNotFoundError()
 
 
@@ -180,7 +188,7 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
                     
                 appliance_list.append((app_name, processed_app_dfs))
                 #new_app_readings = np.array([ new_app_readings[i:i+n] for i in range(len(new_app_readings)-n+1) ])
-                #print (new_mains.shape, new_app_readings.shape, app_name)
+                #_log_print(new_mains.shape, new_app_readings.shape, app_name)
 
             return processed_mains_lst, appliance_list
 
@@ -200,9 +208,9 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
     def set_appliance_params(self,train_appliances):
 
         for (app_name,df_list) in train_appliances:
-            l = np.array(pd.concat(df_list,axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
+            values = np.array(pd.concat(df_list,axis=0))
+            app_mean = np.mean(values)
+            app_std = np.std(values)
             if app_std<1:
                 app_std = 100
             self.appliance_params.update({app_name:{'mean':app_mean,'std':app_std}})
diff --git a/nilmtk_contrib/mains_stats.py b/nilmtk_contrib/mains_stats.py
new file mode 100644
index 0000000..9f217b0
--- /dev/null
+++ b/nilmtk_contrib/mains_stats.py
@@ -0,0 +1,138 @@
+"""Utilities for calculating mains statistics across NILMTK buildings."""
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def _empty_stats(ac_type):
+    """Return the all-zero statistics record used when no mains data is available."""
+    return {
+        "mean": 0,
+        "std": 0,
+        "min": 0,
+        "max": 0,
+        "data_points": 0,
+        "ac_type": ac_type,
+    }
+
+
+def calculate_multi_building_mains_stats(
+    dataset_path,
+    building_ids,
+    start_time,
+    end_time,
+    ac_type="active",
+    sample_period=60,
+    verbose=False,
+):
+    """Calculate mains statistics across multiple buildings.
+
+    NILMTK is imported only when this function is called so importing this
+    module stays cheap and does not access datasets.
+
+    Args:
+        dataset_path: Path passed to ``nilmtk.DataSet``.
+        building_ids: Iterable of keys looked up in ``ds.buildings``.
+        start_time: Window start passed to ``DataSet.set_window``.
+        end_time: Window end passed to ``DataSet.set_window``.
+        ac_type: AC type requested from the mains meter; echoed in the result.
+        sample_period: Sample period forwarded to ``power_series_all_data``.
+        verbose: When true, progress and per-building problems are logged at
+            INFO level (failures with traceback); otherwise failures are only
+            logged at DEBUG level.
+
+    Returns:
+        dict with ``mean``, ``std``, ``min``, ``max``, ``data_points`` and
+        ``ac_type``. The statistics are all 0 when no building yielded data.
+    """
+    import pandas as pd
+    from nilmtk import DataSet
+
+    ds = DataSet(dataset_path)
+    try:
+        ds.set_window(start=start_time, end=end_time)
+        all_mains_data = []
+
+        for building_id in building_ids:
+            if verbose:
+                logger.info("Processing Building %s...", building_id)
+
+            # Only a failed lookup means "not found"; a KeyError raised while
+            # loading the data is a processing failure and is reported below.
+            try:
+                building = ds.buildings[building_id]
+            except KeyError:
+                if verbose:
+                    logger.info("Building %s not found in the dataset.", building_id)
+                continue
+
+            try:
+                mains = building.elec.mains()
+                power_data = mains.power_series_all_data(
+                    ac_type=ac_type,
+                    sample_period=sample_period,
+                )
+
+                if power_data is not None and not power_data.empty:
+                    all_mains_data.append(power_data)
+                elif verbose:
+                    logger.info(
+                        "No data found for Building %s in the specified timeframe.",
+                        building_id,
+                    )
+            except Exception:
+                # Best effort: one broken building must not abort the others.
+                if verbose:
+                    logger.exception("Failed to process Building %s.", building_id)
+                else:
+                    logger.debug(
+                        "Failed to process Building %s.",
+                        building_id,
+                        exc_info=True,
+                    )
+
+        if not all_mains_data:
+            if verbose:
+                logger.info("Could not retrieve data for any specified buildings.")
+            return _empty_stats(ac_type)
+
+        if verbose:
+            logger.info("Combining data from all buildings.")
+        clean_data = pd.concat(all_mains_data).dropna()
+
+        return {
+            "mean": clean_data.mean(),
+            "std": clean_data.std(),
+            "min": clean_data.min(),
+            "max": clean_data.max(),
+            "data_points": len(clean_data),
+            "ac_type": ac_type,
+        }
+    finally:
+        # Close the underlying store (if the DataSet exposes one) even on errors.
+        store = getattr(ds, "store", None)
+        if store is not None:
+            store.close()
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    stats = calculate_multi_building_mains_stats(
+        dataset_path="/home/ubuntu/downloads/refit.h5",
+        building_ids=[2],
+        start_time="2014-04-01",
+        end_time="2014-04-30",
+        ac_type="active",
+        sample_period=60,
+        verbose=True,
+    )
+
+    logger.info("--- Combined Mains Statistics ---")
+    if stats["data_points"] > 0:
+        logger.info("Combined Mains Mean: %.2fW", stats["mean"])
+        logger.info("Combined Mains Std: %.2fW", stats["std"])
+        logger.info("Data Range: %.2fW to %.2fW", stats["min"], stats["max"])
+        logger.info("Total Data Points from all buildings: %s", stats["data_points"])
+    else:
+        logger.info("No data available to calculate statistics.")
diff --git a/nilmtk_contrib/preprocessing/__init__.py b/nilmtk_contrib/preprocessing/__init__.py
new file mode 100644
index 0000000..d0a0de1
--- /dev/null
+++ b/nilmtk_contrib/preprocessing/__init__.py
@@ -0,0 +1,20 @@
+"""Shared preprocessing helpers for NILM models."""
+
+from nilmtk_contrib.preprocessing.alignment import restore_index
+from nilmtk_contrib.preprocessing.classification import make_on_off_labels
+from nilmtk_contrib.preprocessing.normalization import denormalize, normalize
+from nilmtk_contrib.preprocessing.windows import (
+    make_sliding_windows,
+    overlap_average,
+    sequence_to_point_targets,
+)
+
+__all__ = [
+    "denormalize",
+    "make_on_off_labels",
+    "make_sliding_windows",
+    "normalize",
+    "overlap_average",
+    "restore_index",
+    "sequence_to_point_targets",
+]
diff --git a/nilmtk_contrib/preprocessing/alignment.py b/nilmtk_contrib/preprocessing/alignment.py
new file mode 100644
index 0000000..986b1bb
--- /dev/null
+++ b/nilmtk_contrib/preprocessing/alignment.py
@@ -0,0 +1,35 @@
+"""Index alignment helpers."""
+
+import numpy as np
+import pandas as pd
+
+
+def restore_index(predictions, original_index):
+    """Return ``predictions`` as a pandas object indexed like the original signal.
+
+    Args:
+        predictions: Series, DataFrame or array-like with one entry per label.
+            A NumPy array of shape ``(n, 1)`` is flattened to ``n`` values.
+        original_index: Index (or sequence of labels) of the original signal.
+
+    Returns:
+        A re-indexed copy when ``predictions`` is a Series or DataFrame,
+        otherwise a new ``pd.Series`` built from ``predictions``.
+
+    Raises:
+        ValueError: If ``predictions`` and ``original_index`` differ in length.
+    """
+    if len(predictions) != len(original_index):
+        raise ValueError("predictions and original_index must have the same length.")
+
+    # Series and DataFrame share the same handling: copy, then swap the index.
+    if isinstance(predictions, (pd.DataFrame, pd.Series)):
+        restored = predictions.copy()
+        restored.index = original_index
+        return restored
+
+    # pd.Series only accepts 1-D data, so collapse a single-column array first.
+    if isinstance(predictions, np.ndarray) and predictions.ndim == 2 and predictions.shape[1] == 1:
+        predictions = predictions.reshape(-1)
+
+    return pd.Series(predictions, index=original_index)
diff --git a/nilmtk_contrib/preprocessing/classification.py b/nilmtk_contrib/preprocessing/classification.py
new file mode 100644
index 0000000..32f3edb
--- /dev/null
+++ b/nilmtk_contrib/preprocessing/classification.py
@@ -0,0 +1,72 @@
+"""Classification label helpers."""
+
+import numpy as np
+
+
+def make_on_off_labels(values, threshold):
+    """Create binary on/off labels using an explicit power threshold.
+
+    Returns an integer array that is 1 where ``values >= threshold`` and 0
+    elsewhere. Raises ``ValueError`` when ``threshold`` is ``None``.
+    """
+    if threshold is None:
+        raise ValueError("threshold must be explicit.")
+    return (np.asarray(values) >= threshold).astype(int)
+
+
+def appliance_threshold(appliance_params, appliance_name, default_threshold=None):
+    """Return an explicit on/off threshold for one appliance.
+
+    The first value that is not ``None`` wins, in this order: the appliance's
+    ``"on_power_threshold"``, its ``"threshold"``, then ``default_threshold``.
+    A key that is present but set to ``None`` counts as missing.
+
+    Raises:
+        ValueError: If none of the three sources provides a threshold.
+    """
+    params = (appliance_params or {}).get(appliance_name) or {}
+    candidates = (
+        params.get("on_power_threshold"),
+        params.get("threshold"),
+        default_threshold,
+    )
+    for threshold in candidates:
+        if threshold is not None:
+            return threshold
+    raise ValueError(f"Missing on/off threshold for appliance {appliance_name!r}.")
+
+
+def classification_metadata(appliance_params, default_threshold=None):
+    """Return serializable threshold metadata for classification models.
+
+    Appliances are listed in sorted name order; a ``ValueError`` propagates if
+    any appliance has no resolvable threshold.
+    """
+    metadata = {
+        "default_threshold": default_threshold,
+        "appliances": {},
+    }
+    for appliance_name in sorted((appliance_params or {}).keys()):
+        metadata["appliances"][appliance_name] = {
+            "on_power_threshold": appliance_threshold(
+                appliance_params,
+                appliance_name,
+                default_threshold,
+            )
+        }
+    return metadata
+
+
+def loss_weight_metadata(regression_weight=1.0, classification_weight=1.0):
+    """Return serializable loss weight metadata for dual-output models.
+
+    Raises ``ValueError`` when either weight is ``<= 0``.
+    """
+    if regression_weight <= 0:
+        raise ValueError("regression_weight must be positive.")
+    if classification_weight <= 0:
+        raise ValueError("classification_weight must be positive.")
+    return {
+        "regression": regression_weight,
+        "classification": classification_weight,
+    }
diff --git a/nilmtk_contrib/preprocessing/normalization.py b/nilmtk_contrib/preprocessing/normalization.py
new file mode 100644
index 0000000..e51e08b
--- /dev/null
+++ b/nilmtk_contrib/preprocessing/normalization.py
@@ -0,0 +1,48 @@
+"""Normalization helpers."""
+
+from dataclasses import dataclass
+
+import numpy as np
+
+
+@dataclass(frozen=True)
+class NormalizationMetadata:
+    """Records which statistics :func:`normalize` actually applied."""
+
+    mean: float
+    # Standard deviation the caller asked for (may be None, NaN or too small).
+    requested_std: float
+    # Divisor that was really used: ``requested_std`` or the fallback.
+    std_used: float
+
+
+def normalize(values, mean, std, min_std=1, fallback_std=100):
+    """Normalize values without dividing by zero or tiny std values.
+
+    ``fallback_std`` replaces ``std`` when ``std`` is ``None``, not finite
+    (NaN/inf), zero, or smaller in magnitude than ``min_std``.
+
+    Returns:
+        ``(normalized, metadata)``: the z-scored NumPy array and a
+        :class:`NormalizationMetadata` describing the divisor that was used.
+
+    Raises:
+        ValueError: If the fallback is needed but is itself zero or not finite.
+    """
+    unusable = std is None or not np.isfinite(std) or std == 0 or abs(std) < min_std
+    std_used = fallback_std if unusable else std
+    if std_used is None or not np.isfinite(std_used) or std_used == 0:
+        raise ValueError("fallback_std must be a finite, non-zero number.")
+
+    normalized = (np.asarray(values) - mean) / std_used
+    metadata = NormalizationMetadata(
+        mean=mean,
+        requested_std=std,
+        std_used=std_used,
+    )
+    return normalized, metadata
+
+
+def denormalize(values, mean, std):
+    """Undo simple z-score normalization: ``mean + values * std``."""
+    return mean + np.asarray(values) * std
diff --git a/nilmtk_contrib/preprocessing/windows.py b/nilmtk_contrib/preprocessing/windows.py
new file mode 100644
index 0000000..e968a50
--- /dev/null
+++ b/nilmtk_contrib/preprocessing/windows.py
@@ -0,0 +1,174 @@
+"""Windowing and sequence reconstruction helpers."""
+
+import operator
+from dataclasses import dataclass
+
+import numpy as np
+
+
+@dataclass(frozen=True)
+class WindowMetadata:
+    """Padding bookkeeping returned by :func:`make_sliding_windows`."""
+
+    original_length: int
+    window_length: int
+    pad: str
+    pad_left: int
+    pad_right: int
+    pad_value: float
+    # (start, stop) of the unpadded samples inside the padded signal.
+    trim_slice: tuple[int, int]
+
+
+def _as_1d(values):
+    """Flatten any array-like into a 1-D NumPy array."""
+    return np.asarray(values).reshape(-1)
+
+
+def _validate_window_length(window_length):
+    """Return ``window_length`` as a plain ``int`` or raise ``ValueError``.
+
+    ``operator.index`` accepts every integer type (including NumPy integers)
+    while still rejecting floats and strings.
+    """
+    try:
+        length = operator.index(window_length)
+    except TypeError:
+        length = 0
+    if length <= 0:
+        raise ValueError("window_length must be a positive integer.")
+    return length
+
+
+def _windows_from_padded(values, window_length):
+    """Return every full window of ``values`` as the rows of a new 2-D array."""
+    if len(values) < window_length:
+        return np.empty((0, window_length), dtype=values.dtype)
+    # sliding_window_view yields a view; copy so the result owns its data.
+    return np.lib.stride_tricks.sliding_window_view(values, window_length).copy()
+
+
+def make_sliding_windows(values, window_length, pad="center", pad_value=0):
+    """Create sliding windows with explicit padding metadata.
+
+    Args:
+        values: Array-like signal; it is flattened to 1-D.
+        window_length: Positive integer number of samples per window.
+        pad: ``"center"`` splits ``window_length - 1`` pad samples between both
+            ends (the extra one goes right), ``"right"`` appends them all,
+            ``"none"`` adds no padding.
+        pad_value: Constant used for the padded samples.
+
+    Returns:
+        ``(windows, metadata)``: a 2-D array with one window per row and the
+        :class:`WindowMetadata` describing the padding.
+
+    Raises:
+        ValueError: For a non-positive/non-integer ``window_length`` or an
+            unknown ``pad`` mode.
+    """
+    window_length = _validate_window_length(window_length)
+    if pad not in {"center", "right", "none"}:
+        raise ValueError("pad must be one of 'center', 'right', or 'none'.")
+
+    flat = _as_1d(values)
+    original_length = len(flat)
+
+    if pad == "center":
+        total_pad = window_length - 1
+        pad_left = total_pad // 2
+        pad_right = total_pad - pad_left
+    elif pad == "right":
+        pad_left = 0
+        pad_right = window_length - 1
+    else:
+        pad_left = 0
+        pad_right = 0
+
+    padded = np.pad(
+        flat,
+        (pad_left, pad_right),
+        mode="constant",
+        constant_values=pad_value,
+    )
+    windows = _windows_from_padded(padded, window_length)
+    metadata = WindowMetadata(
+        original_length=original_length,
+        window_length=window_length,
+        pad=pad,
+        pad_left=pad_left,
+        pad_right=pad_right,
+        pad_value=pad_value,
+        trim_slice=(pad_left, pad_left + original_length),
+    )
+    return windows, metadata
+
+
+def sequence_to_point_targets(appliance_values, window_length, center=True):
+    """Create sequence-to-point targets from appliance readings.
+
+    With ``center=True`` the result equals column ``window_length // 2`` of the
+    zero-padded ``"center"`` windows; with ``center=False`` it is the last
+    sample of every unpadded window.
+
+    Raises:
+        ValueError: If ``window_length`` is not a positive integer.
+    """
+    window_length = _validate_window_length(window_length)
+    flat = _as_1d(appliance_values)
+    if not center:
+        if len(flat) < window_length:
+            return np.asarray([], dtype=flat.dtype)
+        return flat[window_length - 1 :]
+
+    # Read the centre column straight from the padded signal instead of first
+    # materialising the full (n_windows, window_length) matrix.
+    pad_left = (window_length - 1) // 2
+    pad_right = window_length - 1 - pad_left
+    padded = np.pad(flat, (pad_left, pad_right), mode="constant", constant_values=0)
+    center_index = window_length // 2
+    return padded[center_index : center_index + len(flat)]
+
+
+def overlap_average(windows, original_length, trim=True):
+    """Average overlapping sequence windows back to a single 1D signal.
+
+    Window ``i`` is placed at output positions ``i .. i + window_length - 1``.
+    With ``trim=True`` the result is cut to ``original_length`` samples by
+    dropping ``excess // 2`` leading samples and the remaining surplus at the end.
+
+    Raises:
+        ValueError: If ``windows`` is not 2-D or ``original_length`` is negative.
+    """
+    arr = np.asarray(windows)
+    if arr.ndim != 2:
+        raise ValueError("windows must be a 2D array.")
+    if original_length < 0:
+        raise ValueError("original_length must be non-negative.")
+    if arr.size == 0:
+        return np.asarray([], dtype=arr.dtype)
+
+    window_count, window_length = arr.shape
+    output_length = window_count + window_length - 1
+    totals = np.zeros(output_length, dtype=float)
+    counts = np.zeros(output_length, dtype=float)
+
+    for start, window in enumerate(arr):
+        stop = start + window_length
+        totals[start:stop] += window
+        counts[start:stop] += 1
+
+    averaged = totals / np.maximum(counts, 1)
+    if not trim:
+        return averaged
+
+    if len(averaged) == original_length:
+        return averaged
+
+    excess = len(averaged) - original_length
+    if excess <= 0:
+        return averaged[:original_length]
+
+    trim_left = excess // 2
+    trim_right = trim_left + original_length
+    return averaged[trim_left:trim_right]
diff --git a/nilmtk_contrib/torch/TCN.py b/nilmtk_contrib/torch/TCN.py
new file mode 100644
index 0000000..0978ffe
--- /dev/null
+++ b/nilmtk_contrib/torch/TCN.py
@@ -0,0 +1,458 @@
+from collections import OrderedDict
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+from torch.utils.data import TensorDataset, DataLoader
+from nilmtk.disaggregate import Disaggregator
+
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
+
+
+class SequenceLengthError(Exception):
+    """Raised by TCN when ``sequence_length`` is even."""
+
+
+class ApplianceNotFoundError(Exception):
+    """Raised when an appliance has no entry in ``appliance_params``."""
+
+
+class TemporalConvNet(nn.Module):
+    """
+    Temporal Convolutional Network (TCN) implementation.
+    This network uses a series of temporal blocks with dilated, causal convolutions
+    to capture long-range dependencies in sequential data.
+
+    The blocks keep the sequence length; a final linear layer maps the flattened
+    ``num_filters * sequence_length`` features to a single output per window.
+    """
+
+    def __init__(self, sequence_length, num_levels=8, num_filters=25, kernel_size=7, dropout=0.2):
+        super(TemporalConvNet, self).__init__()
+
+        self.num_levels = num_levels
+        self.num_filters = num_filters
+
+        layers = []
+        num_channels = [1] + [num_filters] * num_levels
+
+        for i in range(num_levels):
+            # Dilation doubles with every level: 1, 2, 4, ...
+            dilation_size = 2 ** i
+            in_channels = num_channels[i]
+            out_channels = num_channels[i+1]
+
+            layers.append(TemporalBlock(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride=1,
+                dilation=dilation_size,
+                padding=(kernel_size-1) * dilation_size,
+                dropout=dropout
+            ))
+
+        self.network = nn.Sequential(*layers)
+
+        # Final fully connected layer
+        self.final_length = self._calculate_output_length(sequence_length, kernel_size, num_levels)
+        self.fc = nn.Linear(num_filters * self.final_length, 1)
+
+        # Initialize weights
+        self._initialize_weights()
+
+    def _calculate_output_length(self, input_length, kernel_size, num_levels):
+        """Calculates the output length after all temporal blocks."""
+        # Causal convolutions with proper padding maintain the sequence length.
+        return input_length
+
+    def _initialize_weights(self):
+        """Initializes weights with Xavier uniform initialization."""
+        # NOTE(review): the Conv1d layers are wrapped with nn.utils.weight_norm in
+        # TemporalBlock; confirm that writing to ``.weight`` here still changes the
+        # effective (re-parametrised) weights.
+        for m in self.modules():
+            if isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+
+    def forward(self, x):
+        # Input shape: (batch_size, 1, sequence_length)
+        x = self.network(x)
+        # Output shape: (batch_size, num_filters, final_length)
+        x = x.view(x.size(0), -1)  # Flatten
+        x = self.fc(x)
+        return x
+
+
+class TemporalBlock(nn.Module):
+    """
+    A single block of a TCN, consisting of two dilated causal convolutions
+    with a residual connection.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride, dilation, padding, dropout=0.2):
+        super(TemporalBlock, self).__init__()
+
+        # First dilated causal convolution
+        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size,
+                              stride=stride, padding=padding, dilation=dilation)
+
+        # Chomp1d removes padding to ensure causality.
+        self.chomp1 = Chomp1d(padding)
+        self.relu1 = nn.ReLU()
+        self.dropout1 = nn.Dropout(dropout)
+
+        # Second dilated causal convolution
+        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size,
+                              stride=stride, padding=padding, dilation=dilation)
+        self.chomp2 = Chomp1d(padding)
+        self.relu2 = nn.ReLU()
+        self.dropout2 = nn.Dropout(dropout)
+
+        # Residual connection (with downsampling if channels differ)
+        self.downsample = nn.Conv1d(in_channels, out_channels, 1) if in_channels != out_channels else None
+        self.relu = nn.ReLU()
+
+        # Weight normalization for stability
+        self.conv1 = nn.utils.weight_norm(self.conv1)
+        self.conv2 = nn.utils.weight_norm(self.conv2)
+        if self.downsample is not None:
+            self.downsample = nn.utils.weight_norm(self.downsample)
+
+        self.init_weights()
+
+    def init_weights(self):
+        """Initializes weights for the temporal block."""
+        nn.init.normal_(self.conv1.weight, 0, 0.01)
+        nn.init.normal_(self.conv2.weight, 0, 0.01)
+        if self.downsample is not None:
+            nn.init.normal_(self.downsample.weight, 0, 0.01)
+
+    def forward(self, x):
+        # First convolution path
+        out = self.conv1(x)
+        out = self.chomp1(out)
+        out = self.relu1(out)
+        out = self.dropout1(out)
+
+        # Second convolution path
+        out = self.conv2(out)
+        out = self.chomp2(out)
+        out = self.relu2(out)
+        out = self.dropout2(out)
+
+        # Add residual connection
+        res = x if self.downsample is None else self.downsample(x)
+
+        # Ensure residual and output have the same length
+        if res.size(2) != out.size(2):
+            res = res[:, :, :out.size(2)]
+
+        return self.relu(out + res)
+
+
+class Chomp1d(nn.Module):
+    """
+    Removes padding from the end of a sequence to make convolutions causal.
+    """
+
+    def __init__(self, chomp_size):
+        super(Chomp1d, self).__init__()
+        self.chomp_size = chomp_size
+
+    def forward(self, x):
+        return x[:, :, :-self.chomp_size].contiguous() if self.chomp_size > 0 else x
+
+
+class TCN(Disaggregator):
+    """
+    Temporal Convolutional Network (TCN) for Non-Intrusive Load Monitoring (NILM).
+
+    Based on "An Empirical Evaluation of Generic Convolutional and Recurrent Networks for Sequence Modeling"
+    by Bai et al., published in arXiv preprint arXiv:1803.01271, 2018.
+    https://arxiv.org/abs/1803.01271
+
+    This implementation applies the TCN architecture to energy disaggregation, using dilated causal
+    convolutions to capture long-range temporal dependencies in power consumption sequences. TCNs
+    have been shown to outperform canonical recurrent networks like LSTMs across diverse sequence
+    modeling tasks while demonstrating longer effective memory.
+
+    Architecture Overview:
+    - Multiple temporal blocks with dilated causal convolutions for long-range dependencies
+    - Residual connections within each temporal block for improved gradient flow
+    - Dropout layers for regularization to prevent overfitting
+    - Sequence-to-point learning for appliance power prediction
+    - Exponentially increasing dilation factors to capture patterns at multiple time scales
+
+    Args:
+        params (dict): Dictionary containing model hyperparameters:
+            - sequence_length (int): Length of input sequences (default: 99, must be odd)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+            - num_levels (int): Number of temporal blocks (default: 8)
+            - num_filters (int): Number of filters per temporal block (default: 25)
+            - kernel_size (int): Kernel size for convolutions (default: 7)
+            - dropout (float): Dropout rate for regularization (default: 0.2)
+            - appliance_params (dict): Appliance-specific normalization parameters
+            - mains_mean (float): Mean normalization for mains power (default: 1800)
+            - mains_std (float): Standard deviation for mains power (default: 600)
+            - chunk_wise_training (bool): Enable chunk-wise training (default: False)
+    """
+
+    def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
+        super().__init__()
+        self.MODEL_NAME = "TCN"
+        self.models = OrderedDict()
+        self.file_prefix = f"{self.MODEL_NAME.lower()}-temp-weights"
+
+        # Hyperparameters
+        self.chunk_wise_training = params.get("chunk_wise_training", False)
+        self.sequence_length = params.get("sequence_length", 99)
+        self.n_epochs = params.get("n_epochs", 10)
+        self.batch_size = params.get("batch_size", 512)
+        self.appliance_params = params.get("appliance_params", {})
+        self.mains_mean = params.get("mains_mean", 1800)
+        self.mains_std = params.get("mains_std", 600)
+
+        # TCN-specific parameters
+        self.num_levels = params.get("num_levels", 8)
+        self.num_filters = params.get("num_filters", 25)
+        self.kernel_size = params.get("kernel_size", 7)
+        self.dropout = params.get("dropout", 0.2)
+
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        # Sequence length must be odd for centered windowing.
+        if self.sequence_length % 2 == 0:
+            _log_print("Sequence length should be odd!")
+            raise SequenceLengthError
+
+        _log_print(f"TCN initialized with sequence_length={self.sequence_length}")
+        _log_print(f"TCN params: levels={self.num_levels}, filters={self.num_filters}, kernel_size={self.kernel_size}")
+        _log_print(f"Using device: {self.device}")
+
+    def return_network(self):
+        """Builds and returns the TCN network."""
+        model = TemporalConvNet(
+            sequence_length=self.sequence_length,
+            num_levels=self.num_levels,
+            num_filters=self.num_filters,
+            kernel_size=self.kernel_size,
+            dropout=self.dropout
+        ).to(self.device)
+
+        # Count parameters
+        total_params = sum(p.numel() for p in model.parameters())
+        _log_print(f"TCN model created with {total_params:,} parameters")
+
+        return model
+
+    def _window_mains(self, mains_lst):
+        """Turns each mains frame into normalized, zero-padded sliding windows."""
+        n = self.sequence_length
+        units_to_pad = n // 2
+        mains_df_list = []
+        for mains in mains_lst:
+            new_mains = mains.values.flatten()
+            new_mains = np.pad(new_mains, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+            new_mains = np.array([new_mains[i:i + n] for i in range(len(new_mains) - n + 1)])
+            new_mains = (new_mains - self.mains_mean) / self.mains_std
+            mains_df_list.append(pd.DataFrame(new_mains))
+        return mains_df_list
+
+    def _predict_in_batches(self, model, inputs):
+        """Runs ``model`` in eval mode over ``inputs`` in ``batch_size`` slices.
+
+        Feeding a whole chunk through the network in one call can exhaust memory,
+        so the forward pass is split and the outputs are concatenated again.
+        """
+        if inputs.size(0) == 0:
+            return inputs.new_zeros((0, 1))
+        model.eval()
+        with torch.no_grad():
+            outputs = [model(batch) for batch in torch.split(inputs, self.batch_size)]
+        return torch.cat(outputs, dim=0)
+
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """Preprocesses data using a sliding window approach.
+
+        For ``method == 'train'`` returns ``(mains_windows, appliance_list)`` where
+        the appliance readings are normalized with ``appliance_params``; for any
+        other ``method`` only the windowed mains are returned.
+        """
+        mains_df_list = self._window_mains(mains_lst)
+        if method != 'train':
+            return mains_df_list
+
+        appliance_list = []
+        for app_name, app_df_list in submeters_lst:
+            if app_name not in self.appliance_params:
+                raise ApplianceNotFoundError(f"Parameters for appliance '{app_name}' not found!")
+            app_mean = self.appliance_params[app_name]['mean']
+            app_std = self.appliance_params[app_name]['std']
+
+            processed_appliance_dfs = []
+            for app_df in app_df_list:
+                new_app_readings = app_df.values.reshape((-1, 1))
+                new_app_readings = (new_app_readings - app_mean) / app_std
+                processed_appliance_dfs.append(pd.DataFrame(new_app_readings))
+            appliance_list.append((app_name, processed_appliance_dfs))
+        return mains_df_list, appliance_list
+
+    def set_appliance_params(self, train_appliances):
+        """Computes and sets normalization parameters for each appliance."""
+        for app_name, df_list in train_appliances:
+            values = np.array(pd.concat(df_list, axis=0))
+            app_mean = np.mean(values)
+            app_std = np.std(values)
+            # A std below 1 is replaced by 100 so normalization never divides by ~0.
+            if app_std < 1:
+                app_std = 100
+            self.appliance_params.update({app_name: {'mean': app_mean, 'std': app_std}})
+        _log_print("Appliance parameters set:", self.appliance_params)
+
+    def partial_fit(self, train_main, train_appliances, do_preprocessing=True, current_epoch=0, **load_kwargs):
+        """Trains one network per appliance on a chunk of data.
+
+        A random 15% of the windows is held out for validation; the weights with
+        the lowest validation loss are checkpointed and restored after training.
+        Chunks with 10 or fewer windows are skipped.
+        """
+        # Compute appliance parameters if not already set
+        if not self.appliance_params:
+            self.set_appliance_params(train_appliances)
+
+        _log_print("...............TCN partial_fit running...............")
+        # Preprocess data
+        if do_preprocessing:
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
+
+        train_main = pd.concat(train_main, axis=0)
+        train_main = train_main.values.reshape((-1, self.sequence_length, 1))
+        new_train_appliances = []
+        for app_name, app_df in train_appliances:
+            app_df = pd.concat(app_df, axis=0)
+            app_df_values = app_df.values.reshape((-1, 1))
+            new_train_appliances.append((app_name, app_df_values))
+        train_appliances = new_train_appliances
+
+        for appliance_name, power in train_appliances:
+            # Create a new model for the appliance if it's the first time training
+            if appliance_name not in self.models:
+                _log_print("First time training for", appliance_name)
+                self.models[appliance_name] = self.return_network()
+            else:
+                _log_print("Retraining model for", appliance_name)
+
+            model = self.models[appliance_name]
+            if train_main.size == 0 or len(train_main) <= 10:
+                continue
+
+            # Conv1d expects (batch, channels, length)
+            train_main_tensor = torch.tensor(train_main, dtype=torch.float32).permute(0, 2, 1).to(self.device)
+            # reshape(-1) (not squeeze()) keeps a 1-D target whatever the sample count.
+            power_tensor = torch.tensor(power, dtype=torch.float32).reshape(-1).to(self.device)
+
+            # Create validation split (15%)
+            n_samples = train_main_tensor.size(0)
+            val_size = max(1, int(0.15 * n_samples)) if n_samples > 1 else 0
+            indices = torch.randperm(n_samples)
+            train_idx, val_idx = indices[val_size:], indices[:val_size]
+
+            train_X = train_main_tensor[train_idx]
+            train_y = power_tensor[train_idx]
+            val_X = train_main_tensor[val_idx]
+            val_y = power_tensor[val_idx]
+
+            # Setup optimizer and loss function
+            optimizer = torch.optim.Adam(model.parameters())
+            criterion = nn.MSELoss()
+
+            best_val_loss = float('inf')
+            checkpoint_saved = False
+            filepath = checkpoint_path(".pth")
+
+            # The loader reshuffles on every iteration, so build it once.
+            train_loader = DataLoader(TensorDataset(train_X, train_y), batch_size=self.batch_size, shuffle=True)
+
+            # Training loop
+            for epoch in range(self.n_epochs):
+                model.train()
+
+                epoch_losses = []
+                for batch_X, batch_y in train_loader:
+                    optimizer.zero_grad()
+                    # squeeze(-1) keeps the batch axis even for a final batch of one.
+                    predictions = model(batch_X).squeeze(-1)
+                    loss = criterion(predictions, batch_y)
+                    loss.backward()
+
+                    # Gradient clipping to prevent exploding gradients
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+
+                    optimizer.step()
+                    epoch_losses.append(loss.item())
+
+                # Validation at the end of each epoch
+                val_predictions = self._predict_in_batches(model, val_X).squeeze(-1)
+                val_loss = criterion(val_predictions, val_y).item()
+
+                avg_train_loss = np.mean(epoch_losses)
+                _log_print(f"Epoch {epoch+1}/{self.n_epochs} - loss: {avg_train_loss:.4f} - val_loss: {val_loss:.4f}")
+
+                # Save the best model based on validation loss
+                if val_loss < best_val_loss:
+                    best_val_loss = val_loss
+                    torch.save(model.state_dict(), filepath)
+                    checkpoint_saved = True
+                    _log_print(f"Validation loss improved, saving model to {filepath}")
+
+            # Load the best weights after training. Nothing is on disk when the
+            # validation loss was never a comparable number (e.g. NaN) or n_epochs is 0.
+            if checkpoint_saved:
+                model.load_state_dict(torch.load(filepath, map_location=self.device))
+            else:
+                _log_print("No checkpoint was saved for", appliance_name, "- keeping the current weights")
+
+    def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
+        """Disaggregates a chunk of mains data.
+
+        Returns one DataFrame per mains frame with a float32 column of
+        denormalized, non-negative predictions for every trained appliance.
+        """
+        if model is not None:
+            self.models = model
+
+        # Preprocess test data
+        if do_preprocessing:
+            test_main_list = self.call_preprocessing(test_main_list, submeters_lst=None, method='test')
+
+        test_predictions = []
+        for test_main in test_main_list:
+            test_main = test_main.values
+            test_main = test_main.reshape((-1, self.sequence_length, 1))
+
+            # Convert to tensor for Conv1d
+            test_main_tensor = torch.tensor(test_main, dtype=torch.float32).permute(0, 2, 1).to(self.device)
+
+            disggregation_dict = {}
+            for appliance in self.models:
+                # Batched forward pass so long chunks do not have to fit in memory at once.
+                prediction = self._predict_in_batches(self.models[appliance], test_main_tensor).cpu().numpy()
+                # Denormalize predictions
+                app_mean = self.appliance_params[appliance]['mean']
+                app_std = self.appliance_params[appliance]['std']
+                prediction = prediction * app_std + app_mean
+                valid_predictions = prediction.flatten()
+                valid_predictions[valid_predictions < 0] = 0
+                disggregation_dict[appliance] = pd.Series(valid_predictions)
+            results = pd.DataFrame(disggregation_dict, dtype='float32')
+            test_predictions.append(results)
+        return test_predictions
\ No newline at end of file
diff --git a/nilmtk_contrib/torch/WindowGRU.py b/nilmtk_contrib/torch/WindowGRU.py
index d1ee2ef..58c2653 100644
--- a/nilmtk_contrib/torch/WindowGRU.py
+++ b/nilmtk_contrib/torch/WindowGRU.py
@@ -1,259 +1,365 @@
 import torch
 import torch.nn as nn
-import torch.optim as optim
 from torch.utils.data import DataLoader, TensorDataset
 from collections import OrderedDict
 import numpy as np
 import pandas as pd
-from tqdm import tqdm
 from nilmtk.disaggregate import Disaggregator
 
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
+class FastReLUGRU(nn.Module):
+    """
+    ``nn.GRU`` followed by a small Linear -> ReLU -> Linear head.
+
+    With ``return_sequences=True`` the head is applied to every time step and
+    ``forward`` returns ``(sequence_output, final_hidden)``; otherwise only the
+    transformed final hidden state is produced and ``forward`` returns
+    ``(None, final_hidden)``. In sequence mode ``final_hidden`` is the GRU's own
+    hidden state, returned without passing through the head.
+    """
+    def __init__(self, input_size, hidden_size, batch_first=True, bidirectional=False, return_sequences=True):
+        super(FastReLUGRU, self).__init__()
+        self.return_sequences = return_sequences
+
+        # The recurrent part is the stock PyTorch GRU.
+        self.gru = nn.GRU(
+            input_size=input_size,
+            hidden_size=hidden_size,
+            batch_first=batch_first,
+            bidirectional=bidirectional
+        )
+        # A bidirectional GRU emits forward and backward features side by side.
+        width = 2 * hidden_size if bidirectional else hidden_size
+        self.activation_transform = nn.Sequential(
+            nn.Linear(width, width),
+            nn.ReLU(),
+            nn.Linear(width, width)
+        )
+
+    def forward(self, input, h0=None):
+        if not self.return_sequences:
+            # Only the last hidden state is needed in this mode.
+            _, last_h = self.gru(input, h0)
+            if last_h.dim() == 3:
+                if last_h.size(0) == 2:
+                    # A leading axis of 2 is treated as the two directions: join on features.
+                    last_h = torch.cat([last_h[0], last_h[1]], dim=1)
+                else:
+                    last_h = last_h.squeeze(0)
+            return None, self.activation_transform(last_h)
+
+        seq_out, last_h = self.gru(input, h0)
+        # Flatten the two leading axes, run the head, then restore the shape.
+        dim0, dim1, feat = seq_out.shape
+        flat = seq_out.reshape(-1, feat)
+        seq_out = self.activation_transform(flat).reshape(dim0, dim1, feat)
+        return seq_out, last_h
+
 class GRUNet(nn.Module):
     """
-    Neural network combining 1D CNN feature extraction with bidirectional GRU layers
-    for sequence-to-point NILM disaggregation.
+    Neural network intended to align with the TensorFlow WindowGRU architecture.
     """
     def __init__(self, sequence_length):
         super(GRUNet, self).__init__()
-        # 1D CNN for initial feature extraction
-        self.conv1    = nn.Conv1d(1, 16, kernel_size=4, padding=2)
+        # 1D CNN; NOTE: kernel_size=4 with padding=2 yields seq_len + 1 output steps, not TF-style "same" padding
+        self.conv1 = nn.Conv1d(1, 16, kernel_size=4, padding=2, stride=1)
         
-        # Bidirectional GRU layers for sequence modeling
-        self.gru1     = nn.GRU(16, 64, batch_first=True, bidirectional=True)
+        # Bidirectional Fast ReLU GRU layers (much faster than custom cells)
+        # First GRU: return_sequences=True (matches TF)
+        self.gru1 = FastReLUGRU(16, 64, batch_first=True, bidirectional=True, return_sequences=True)
         self.dropout1 = nn.Dropout(0.5)
-        self.gru2     = nn.GRU(128, 128, batch_first=True, bidirectional=True)
+        
+        # Second GRU: return_sequences=False (matches TF)
+        self.gru2 = FastReLUGRU(128, 128, batch_first=True, bidirectional=True, return_sequences=False)
         self.dropout2 = nn.Dropout(0.5)
         
-        # Final layers for single value prediction
-        self.fc1      = nn.Linear(256, 128)
+        # Fully Connected Layers matching TF
+        self.fc1 = nn.Linear(256, 128)  # 256 = 128*2 (bidirectional)
         self.dropout3 = nn.Dropout(0.5)
-        self.fc2      = nn.Linear(128, 1)
+        self.fc2 = nn.Linear(128, 1)
+        
+        # Initialize weights to match TensorFlow defaults
+        self._init_weights()
+
+    def _init_weights(self):
+        """
+        Re-initialise parameters by name: Xavier-uniform weights, zero biases.
+
+        Rules are tried in order against each parameter name and the first
+        match decides the initialiser; unmatched parameters are left as-is.
+        """
+        glorot, zero = nn.init.xavier_uniform_, nn.init.zeros_
+        rules = (
+            # GRU input/hidden weights and biases
+            (lambda n: 'weight_ih' in n or 'weight_hh' in n, glorot),
+            (lambda n: 'bias_ih' in n or 'bias_hh' in n, zero),
+            # Linear layers of the post-GRU transform
+            (lambda n: 'activation_transform' in n and 'weight' in n, glorot),
+            (lambda n: 'activation_transform' in n and 'bias' in n, zero),
+            # Conv1d front end
+            (lambda n: 'weight' in n and 'conv1' in n, glorot),
+            (lambda n: 'bias' in n and 'conv1' in n, zero),
+            # Dense output layers
+            (lambda n: 'fc' in n and 'weight' in n, glorot),
+            (lambda n: 'fc' in n and 'bias' in n, zero),
+        )
+        for name, param in self.named_parameters():
+            for matches, initialise in rules:
+                if matches(name):
+                    initialise(param)
+                    break
 
     def forward(self, x):
-        # Extract features using 1D convolution
+        # 1D Conv with ReLU activation (matching TF)
         x = self.conv1(x)           # [batch, 1, seq_len] -> [batch, 16, seq_len]
         x = torch.relu(x)
         x = x.permute(0, 2, 1)      # Rearrange for GRU: [batch, seq_len, 16]
         
-        # Process through bidirectional GRU layers
-        x, _   = self.gru1(x)       # [batch, seq_len, 128]
-        x      = self.dropout1(x)
-        _, h_n = self.gru2(x)       # h_n: [2, batch, 128] (final hidden states)
+        # First bidirectional ReLU GRU with return_sequences=True
+        x, _ = self.gru1(x)         # [batch, seq_len, 128] (64*2)
+        x = self.dropout1(x)
         
-        # Combine forward and backward final states
-        h      = torch.cat([h_n[-2], h_n[-1]], dim=1)  # [batch, 256]
-        h      = self.dropout2(h)
+        # Second bidirectional ReLU GRU with return_sequences=False (only final state)
+        _, h_n = self.gru2(x)       # h_n: [batch, 256] (128*2 concatenated final states)
+        h = self.dropout2(h_n)
         
-        # Final prediction layers
-        h      = self.fc1(h)        # [batch, 128]
-        h      = torch.relu(h)
-        h      = self.dropout3(h)
-        out    = self.fc2(h)        # [batch, 1]
+        # Dense layers with ReLU and linear activation
+        h = self.fc1(h)             # [batch, 128]
+        h = torch.relu(h)
+        h = self.dropout3(h)
+        out = self.fc2(h)           # [batch, 1] - linear activation (no activation)
         return out
 
 class WindowGRU(Disaggregator):
     """
-    NILM disaggregator using windowed GRU approach with custom preprocessing.
-    Uses sliding windows and GRU networks for appliance disaggregation.
+    Window-based GRU neural network for Non-Intrusive Load Monitoring (NILM).
+    
+    Based on "Sliding window approach for online energy disaggregation using artificial neural networks"
+    by Krystalakos et al., published in Proceedings of the 10th Hellenic Conference on Artificial Intelligence, 2018.
+    DOI: https://doi.org/10.1145/3200947.3201011
+    
+    This implementation uses a sliding window approach for real-time energy disaggregation,
+    employing recurrent neural networks with Gated Recurrent Units (GRUs) for temporal 
+    pattern recognition in power consumption data.
+    
+    Architecture Overview:
+    - 1D convolutional layer for initial feature extraction from power sequences
+    - Two bidirectional GRU layers with ReLU activation for temporal sequence modeling
+    - Dropout layers for regularization to prevent overfitting
+    - Fully connected layers for final power consumption prediction
+    - Sliding window approach for online, real-time energy disaggregation
+    
+    Args:
+        params (dict): Dictionary containing model hyperparameters:
+            - sequence_length (int): Length of input sequences (default: 99)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+            - save-model-path (str): Path to save trained models (optional)
+            - pretrained-model-path (str): Path to load pre-trained models (optional)
+            - chunk_wise_training (bool): Enable chunk-wise training (default: False)
     """
     def __init__(self, params):
-        super().__init__()
-        self.MODEL_NAME      = "WindowGRU"
-        self.file_prefix     = f"{self.MODEL_NAME.lower()}-temp-weights"
-        
-        # Extract hyperparameters
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
+        self.MODEL_NAME = "WindowGRU"
+        self.file_prefix = "{}-temp-weights".format(self.MODEL_NAME.lower())
         self.save_model_path = params.get('save-model-path', None)
         self.load_model_path = params.get('pretrained-model-path', None)
+        self.chunk_wise_training = params.get('chunk_wise_training', False)
         self.sequence_length = params.get('sequence_length', 99)
-        self.n_epochs        = params.get('n_epochs', 10)
-        self.batch_size      = params.get('batch_size', 512)
-        self.max_val         = 800  # Normalization factor
-        self.models          = OrderedDict()  # Store separate models for each appliance
-        self.device          = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.n_epochs = params.get('n_epochs', 10)
+        self.models = OrderedDict()
+        self.max_val = 800
+        self.batch_size = params.get('batch_size', 512)
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
     def return_network(self):
         """Factory method to create a new GRU model instance"""
         return GRUNet(self.sequence_length).to(self.device)
 
-    def partial_fit(self, train_main, train_appliances,
-                    do_preprocessing=True, current_epoch=0, **kwargs):
-        """Train models on a chunk of data (supports incremental learning)"""
-        
-        # Preprocess data using custom windowing approach
+    def partial_fit(self, train_main, train_appliances, do_preprocessing=True, current_epoch=0, **load_kwargs):
+        """
+        Train one GRUNet per appliance on this chunk, continuing from existing weights if present.
+        The last 15% of the windows are held out for validation, and the weights with the lowest
+        validation loss seen in this call are restored at the end.
+        """
         if do_preprocessing:
-            train_main, train_appliances = self.call_preprocessing(
-                train_main, train_appliances, 'train'
-            )
+            train_main, train_appliances = self.call_preprocessing(train_main, train_appliances, 'train')
 
-        # Prepare main power data for training
-        mains_arr = pd.concat(train_main, axis=0).values \
-                    .reshape(-1, self.sequence_length)  # [N, seq_len]
-        
-        # Prepare appliance power data 
-        new_apps = []
-        for app_name, df_list in train_appliances:
-            concatenated = pd.concat(df_list, axis=0)
-            arr = concatenated.values.reshape(-1, 1)      # [N, 1]
-            new_apps.append((app_name, arr))
+        train_main = pd.concat(train_main, axis=0).values
+        train_main = train_main.reshape((-1, self.sequence_length, 1))
+        new_train_appliances = []
+        for app_name, app_df in train_appliances:
+            app_df = pd.concat(app_df, axis=0).values
+            app_df = app_df.reshape((-1, 1))
+            new_train_appliances.append((app_name, app_df))
 
-        # Train a separate model for each appliance
-        for app_name, arr in new_apps:
-            # Create new model if this appliance hasn't been seen before
+        train_appliances = new_train_appliances
+        for app_name, app_df in train_appliances:
             if app_name not in self.models:
+                _log_print("First model training for", app_name)
                 self.models[app_name] = self.return_network()
-            model = self.models[app_name]
+            else:
+                _log_print("Started re-training model for", app_name)
 
-            # Convert to tensors and split into train/validation
-            x_cpu = torch.tensor(mains_arr, dtype=torch.float32)
-            y_cpu = torch.tensor(arr, dtype=torch.float32)
-            split = int(len(x_cpu) * 0.85)
-
-            train_ds = TensorDataset(x_cpu[:split], y_cpu[:split])
-            val_ds   = TensorDataset(x_cpu[split:], y_cpu[split:])
-            train_loader = DataLoader(train_ds,
-                                      batch_size=self.batch_size,
-                                      shuffle=True)
-            val_loader   = DataLoader(val_ds,
-                                      batch_size=self.batch_size)
-
-            # Setup training components
+            model = self.models[app_name]
+            filepath = checkpoint_path(".pt")
+
+            # Convert to PyTorch tensors (both arrays were already reshaped above)
+            mains_tensor = torch.tensor(train_main, dtype=torch.float32).permute(0, 2, 1)  # [B, 1, seq]
+            app_tensor = torch.tensor(app_df, dtype=torch.float32).squeeze(-1)  # [B]; -1 keeps a single sample 1-D
+
+            # Use validation split like TF (last 15% instead of random split)
+            n_total = len(mains_tensor)
+            val_size = max(1, int(0.15 * n_total)) if n_total > 1 else 0
+            train_size = n_total - val_size
+
+            train_x = mains_tensor[:train_size].to(self.device)
+            val_x = mains_tensor[train_size:].to(self.device)
+            train_y = app_tensor[:train_size].to(self.device)
+            val_y = app_tensor[train_size:].to(self.device)
+
+            # Use Adam with TensorFlow-style default parameters.
+            optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-07, weight_decay=0.0)
             criterion = nn.MSELoss()
-            optimizer = optim.Adam(model.parameters(), lr=1e-3)
-            best_val  = float('inf')
-            ckpt_path = f"{self.file_prefix}-{app_name.replace(' ','_')}-epoch{current_epoch}.pt"
-
-            # Training loop
-            for epoch in tqdm(range(self.n_epochs),
-                              desc=f"Train {app_name}"):
+            best_val_loss = float('inf')
+
+            # Create DataLoader for training data with shuffle=True (like TF)
+            train_dataset = TensorDataset(train_x, train_y)
+            train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
+
+            for epoch in range(self.n_epochs):
                 # Training phase
                 model.train()
-                for xb_cpu, yb_cpu in train_loader:
-                    xb = xb_cpu.unsqueeze(1).to(self.device)  # Add channel dim: [B,1,seq]
-                    yb = yb_cpu.to(self.device)               # [B,1]
+                train_loss = 0.0
+                num_batches = 0
+
+                for batch_x, batch_y in train_loader:
                     optimizer.zero_grad()
-                    out = model(xb)                           # [B,1]
-                    loss = criterion(out, yb)
+                    outputs = model(batch_x).squeeze(-1)  # Ensure output shape matches target
+                    loss = criterion(outputs, batch_y)
                     loss.backward()
                     optimizer.step()
-                    
-                # Validation phase
+                    train_loss += loss.item()
+                    num_batches += 1
+
+                train_loss /= num_batches
+
+                # Validation phase (evaluate on full validation set at once)
                 model.eval()
-                val_losses = []
                 with torch.no_grad():
-                    for xb_cpu, yb_cpu in val_loader:
-                        xb = xb_cpu.unsqueeze(1).to(self.device)
-                        yb = yb_cpu.to(self.device)
-                        out = model(xb)
-                        val_losses.append(criterion(out, yb).item())
-                val_loss = sum(val_losses) / len(val_losses)
+                    val_outputs = model(val_x).squeeze(-1)
+                    val_loss = criterion(val_outputs, val_y).item()
                 
-                # Save best model based on validation loss
-                if val_loss < best_val:
-                    best_val = val_loss
-                    torch.save(model.state_dict(), ckpt_path)
-                    
-            # Load the best model weights
-            model.load_state_dict(torch.load(ckpt_path,
-                                             map_location=self.device))
-            torch.cuda.empty_cache()
-
+                # Save best model (like ModelCheckpoint in TF with verbose=1)
+                if val_loss < best_val_loss:
+                    best_val_loss = val_loss
+                    torch.save(model.state_dict(), filepath)
+                    _log_print(f'Epoch {epoch+1}/{self.n_epochs} - loss: {train_loss:.4f} - val_loss: {val_loss:.4f}')
+            # Restore the best weights; nothing to restore if no epoch wrote a checkpoint (0 epochs / NaN loss)
+            if best_val_loss < float('inf'):
+                model.load_state_dict(torch.load(filepath, map_location=self.device))
     def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
-        """Disaggregate power consumption for each appliance from aggregate mains data"""
-        
+        """
+        Disaggregate each mains chunk: returns one float32 DataFrame per chunk, one column per appliance.
+        """
         if model is not None:
             self.models = model
-            
-        # Preprocess test data using custom windowing
+
         if do_preprocessing:
             test_main_list = self.call_preprocessing(
-                test_main_list, None, 'test'
-            )
-
-        results = []
+                test_main_list, submeters_lst=None, method='test')
         
-        # Process each chunk of test data
+        test_predictions = []
         for mains in test_main_list:
-            arr = mains.values.reshape(-1, self.sequence_length)
-            x_cpu = torch.tensor(arr, dtype=torch.float32)
-            test_loader = DataLoader(TensorDataset(x_cpu),
-                                     batch_size=self.batch_size)
-            out_dict = {}
-            
-            # Get predictions from each appliance model
-            for app_name, m in self.models.items():
-                preds = []
-                m.eval()
+            disaggregation_dict = {}
+            mains = mains.values.reshape((-1, self.sequence_length, 1))
+            # Build the input tensor once per chunk; it is identical for every appliance model
+            mains_tensor = torch.tensor(mains, dtype=torch.float32).permute(0, 2, 1).to(self.device)
+            for appliance in self.models:
+                model = self.models[appliance]
+                model.eval()
                 with torch.no_grad():
-                    for (xb_cpu,) in test_loader:
-                        xb = xb_cpu.unsqueeze(1).to(self.device)
-                        p  = m(xb).view(-1).cpu().numpy()
-                        preds.append(p)
-                        
-                # Combine predictions and denormalize
-                all_pred = np.concatenate(preds)
-                all_pred = np.clip(all_pred, 0, None) * self.max_val
-                out_dict[app_name] = pd.Series(all_pred)
-                torch.cuda.empty_cache()
+                    # Process in batches following the legacy TensorFlow behavior.
+                    predictions = []
+                    for i in range(0, len(mains_tensor), self.batch_size):
+                        batch = mains_tensor[i:i + self.batch_size]
+                        batch_pred = model(batch).cpu().numpy()
+                        predictions.append(batch_pred)
+                    prediction = np.concatenate(predictions, axis=0)
                 
-            # Combine all appliance predictions for this chunk
-            results.append(pd.DataFrame(out_dict, dtype='float32'))
-        return results
+                valid_predictions = prediction.reshape(len(prediction))  # one value per window
+                valid_predictions = np.where(valid_predictions > 0, valid_predictions, 0)
+                valid_predictions = self._denormalize(valid_predictions, self.max_val)
+                disaggregation_dict[appliance] = pd.Series(valid_predictions)
+            results = pd.DataFrame(disaggregation_dict, dtype='float32')
+            test_predictions.append(results)
+        return test_predictions
 
     def call_preprocessing(self, mains_lst, submeters_lst, method):
-        """Custom preprocessing with sliding window approach"""
-        
         if method == 'train':
-            pm, apps = [], []
-            
-            # Process mains data with padding and windowing
-            for mains in mains_lst:
-                pad = [0] * (self.sequence_length - 1)
-                tmp = pd.concat([mains,
-                                 pd.DataFrame({mains.columns[0]: pad})])
-                pm.append(pd.DataFrame(self.preprocess_train_mains(tmp)))
-                
-            # Process appliance data
-            for name, lst in submeters_lst:
-                dfs = [pd.DataFrame(self.preprocess_train_appliances(df))
-                       for df in lst]
-                apps.append((name, dfs))
-            return pm, apps
+            _log_print("Training processing")
+            processed_mains = []
 
-        if method == 'test':
-            pm = []
-            
-            # Process test mains data with padding and windowing
             for mains in mains_lst:
-                pad = [0] * (self.sequence_length - 1)
-                tmp = pd.concat([mains,
-                                 pd.DataFrame({mains.columns[0]: pad})])
-                pm.append(pd.DataFrame(self.preprocess_test_mains(tmp)))
-            return pm
+                # add padding values
+                padding = [0 for i in range(0, self.sequence_length - 1)]
+                paddf = pd.DataFrame({mains.columns.values[0]: padding})
+                mains = pd.concat([mains, paddf])
+                mainsarray = self.preprocess_train_mains(mains)
+                processed_mains.append(pd.DataFrame(mainsarray))
 
-    def preprocess_train_mains(self, mains):
-        """Create sliding windows from mains data for training"""
-        arr = (mains / self.max_val).values
-        # Create sliding window indices
-        idx = (np.arange(self.sequence_length)[None, :]
-               + np.arange(len(arr) - self.sequence_length + 1)[:, None])
-        return arr[idx].reshape(-1, self.sequence_length)
+            tuples_of_appliances = []
+            for (appliance_name, app_dfs_list) in submeters_lst:
+                processed_app_dfs = []
+                for app_df in app_dfs_list:                    
+                    data = self.preprocess_train_appliances(app_df)
+                    processed_app_dfs.append(pd.DataFrame(data))
+                tuples_of_appliances.append((appliance_name, processed_app_dfs))
 
-    def preprocess_train_appliances(self, app):
-        """Normalize appliance data for training"""
-        return (app / self.max_val).values.reshape(-1, 1)
+            return processed_mains, tuples_of_appliances
+
+        if method == 'test':
+            processed_mains = []
+            for mains in mains_lst:                
+                # add padding values
+                padding = [0 for i in range(0, self.sequence_length - 1)]
+                paddf = pd.DataFrame({mains.columns.values[0]: padding})
+                mains = pd.concat([mains, paddf])
+                mainsarray = self.preprocess_test_mains(mains)
+                processed_mains.append(pd.DataFrame(mainsarray))
+
+            return processed_mains
 
     def preprocess_test_mains(self, mains):
-        """Create sliding windows from mains data for testing"""
-        arr = (mains / self.max_val).values
-        # Create sliding window indices
-        idx = (np.arange(self.sequence_length)[None, :]
-               + np.arange(len(arr) - self.sequence_length + 1)[:, None])
-        return arr[idx].reshape(-1, self.sequence_length)
+        mains = self._normalize(mains, self.max_val)
+        mainsarray = np.array(mains)
+        indexer = np.arange(self.sequence_length)[
+            None, :] + np.arange(len(mainsarray) - self.sequence_length + 1)[:, None]
+        mainsarray = mainsarray[indexer]
+        mainsarray = mainsarray.reshape((-1, self.sequence_length))
+        return pd.DataFrame(mainsarray)
+
+    def preprocess_train_appliances(self, appliance):
+        appliance = self._normalize(appliance, self.max_val)
+        appliancearray = np.array(appliance)
+        appliancearray = appliancearray.reshape((-1, 1))
+        return pd.DataFrame(appliancearray)
+
+    def preprocess_train_mains(self, mains):
+        mains = self._normalize(mains, self.max_val)
+        mainsarray = np.array(mains)
+        indexer = np.arange(self.sequence_length)[None, :] + np.arange(len(mainsarray) - self.sequence_length + 1)[:, None]
+        mainsarray = mainsarray[indexer]
+        mainsarray = mainsarray.reshape((-1, self.sequence_length))
+        return pd.DataFrame(mainsarray)
 
-    def _normalize(self, chunk, m):
-        """Normalize data by dividing by maximum value"""
-        return chunk / m
+    def _normalize(self, chunk, mmax):
+        # Scale ``chunk`` down by the divisor ``mmax``.
+        return chunk / mmax
 
-    def _denormalize(self, chunk, m):
-        """Denormalize data by multiplying by maximum value"""
-        return chunk * m
\ No newline at end of file
+    def _denormalize(self, chunk, mmax):
+        # Scale ``chunk`` back up by the factor ``mmax``.
+        return chunk * mmax
diff --git a/nilmtk_contrib/torch/__init__.py b/nilmtk_contrib/torch/__init__.py
index e69de29..8764c54 100644
--- a/nilmtk_contrib/torch/__init__.py
+++ b/nilmtk_contrib/torch/__init__.py
@@ -0,0 +1,59 @@
+"""Lazy exports for PyTorch NILMTK disaggregators."""
+
+from importlib import import_module
+
+from nilmtk_contrib.utils.optional_imports import OptionalDependencyError
+
+_EXPORTS = {
+    "BERT": ("nilmtk_contrib.torch.bert", "BERT"),
+    "ConvLSTM": ("nilmtk_contrib.torch.conv_lstm", "ConvLSTM"),
+    "DAE": ("nilmtk_contrib.torch.dae", "DAE"),
+    "MSDC": ("nilmtk_contrib.torch.msdc", "MSDC"),
+    "NILMFormer": ("nilmtk_contrib.torch.nilmformer", "NILMFormer"),
+    "Reformer": ("nilmtk_contrib.torch.reformer", "Reformer"),
+    "ResNet": ("nilmtk_contrib.torch.resnet", "ResNet"),
+    "ResNet_classification": (
+        "nilmtk_contrib.torch.resnet_classification",
+        "ResNet_classification",
+    ),
+    "RNN": ("nilmtk_contrib.torch.rnn", "RNN"),
+    "RNN_attention": ("nilmtk_contrib.torch.rnn_attention", "RNN_attention"),
+    "RNN_attention_classification": (
+        "nilmtk_contrib.torch.rnn_attention_classification",
+        "RNN_attention_classification",
+    ),
+    "Seq2PointTorch": ("nilmtk_contrib.torch.seq2point", "Seq2PointTorch"),
+    "Seq2Seq": ("nilmtk_contrib.torch.seq2seq", "Seq2Seq"),
+    "TCN": ("nilmtk_contrib.torch.TCN", "TCN"),
+    "WindowGRU": ("nilmtk_contrib.torch.WindowGRU", "WindowGRU"),
+}
+
+_DEPENDENCY_EXTRAS = {
+    "nilmtk": "nilm",
+    "sklearn": "classical",
+    "torch": "torch",
+    "tqdm": "torch",
+}
+
+__all__ = sorted(_EXPORTS)
+
+
+def __getattr__(name):
+    """Lazily import the exported class ``name`` and cache it in the module globals."""
+    if name not in _EXPORTS:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+    module_name, class_name = _EXPORTS[name]
+    try:
+        module = import_module(module_name)
+    except ModuleNotFoundError as exc:
+        missing_package = exc.name or "required dependency"
+        # exc.name may be dotted ("pkg.sub"); the extras table is keyed by top-level package.
+        install_extra = _DEPENDENCY_EXTRAS.get(missing_package.split(".")[0], "torch")
+        message = f"{name} requires '{missing_package}'. Install nilmtk-contrib[{install_extra}]."
+        raise OptionalDependencyError(message) from exc
+
+    value = getattr(module, class_name)
+    # Later attribute lookups find the cached global and no longer reach __getattr__.
+    globals()[name] = value
+    return value
diff --git a/nilmtk_contrib/torch/bert.py b/nilmtk_contrib/torch/bert.py
index 0684a53..f8cfaf3 100644
--- a/nilmtk_contrib/torch/bert.py
+++ b/nilmtk_contrib/torch/bert.py
@@ -1,6 +1,3 @@
-import os
-import random
-import pickle
 import numpy as np
 import pandas as pd
 import torch
@@ -8,15 +5,14 @@
 import torch.optim as optim
 from torch.utils.data import Dataset, DataLoader
 from collections import OrderedDict
-from sklearn.model_selection import train_test_split
-from warnings import warn
+from nilmtk_contrib.utils.validation import safe_train_test_split as train_test_split
 from nilmtk.disaggregate import Disaggregator
 from tqdm import tqdm  # Added for progress bars
 
-random.seed(10)
-np.random.seed(10)
-torch.manual_seed(10)
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
 
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 class SequenceLengthError(Exception):
     pass
 
@@ -37,7 +33,7 @@ class TransformerBlock(nn.Module):
     """
     def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
         super(TransformerBlock, self).__init__()
-        self.att = nn.MultiheadAttention(embed_dim, num_heads, dropout=rate)
+        self.att = nn.MultiheadAttention(embed_dim, num_heads, dropout=rate, batch_first=True)
         self.ffn = nn.Sequential(
             nn.Linear(embed_dim, ff_dim),
             nn.ReLU(),
@@ -49,7 +45,7 @@ def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
         self.dropout2 = nn.Dropout(rate)
         
     def forward(self, x):
-        # x shape: [seq_len, batch, embed_dim]
+        # x shape: [batch, seq_len, embed_dim] with batch_first=True
         attn_output, _ = self.att(x, x, x)
         attn_output = self.dropout1(attn_output)
         out1 = self.layernorm1(x + attn_output)
@@ -57,30 +53,41 @@ def forward(self, x):
         ffn_output = self.dropout2(ffn_output)
         return self.layernorm2(out1 + ffn_output)
 
-class PositionalEncoding(nn.Module):
-    def __init__(self, embed_dim, maxlen):
-        super(PositionalEncoding, self).__init__()
-        self.pos_emb = nn.Parameter(torch.randn(1, maxlen, embed_dim)) 
-
-    def forward(self, x):
-        return x + self.pos_emb  # add positional info
-
 class TokenAndPositionEmbedding(nn.Module):
     def __init__(self, maxlen, vocab_size, embed_dim):
         super(TokenAndPositionEmbedding, self).__init__()
         self.token_emb = nn.Embedding(vocab_size, embed_dim)
         self.pos_emb = nn.Embedding(maxlen, embed_dim)
-        self.maxlen = maxlen
+        self.embed_dim = embed_dim
         
     def forward(self, x):
-        positions = torch.arange(0, self.maxlen, dtype=torch.long, device=x.device)
-        positions = self.pos_emb(positions)
-        x = self.token_emb(x)
-        return x + positions
+        # x comes in as [B, seq_len, 16] from conv layer
+        batch_size, seq_len, features = x.shape
+        
+        # Convert continuous values to discrete tokens for each feature dimension
+        # Take the mean across features and discretize
+        x_mean = x.mean(dim=-1)  # [B, seq_len]
+        
+        # Scale and clamp to vocab range
+        x_tokens = torch.clamp((x_mean * 1000).long(), 0, self.token_emb.num_embeddings - 1)
+        
+        # Get position embeddings
+        positions = torch.arange(0, seq_len, dtype=torch.long, device=x.device)
+        positions = self.pos_emb(positions)  # [seq_len, embed_dim]
+        
+        # Get token embeddings
+        token_embs = self.token_emb(x_tokens)  # [B, seq_len, embed_dim]
+        
+        return token_embs + positions.unsqueeze(0)  # [B, seq_len, embed_dim]
 
 class LPpool(nn.Module):
     def __init__(self, pool_size, stride=None, padding=0):
         super(LPpool, self).__init__()
+        if stride is None:
+            stride = pool_size
+        # For 'same' padding equivalent, calculate padding size
+        if padding == 'same':
+            padding = (pool_size - 1) // 2
         self.avgpool = nn.AvgPool1d(pool_size, stride=stride, padding=padding)
         
     def forward(self, x):
@@ -104,8 +111,32 @@ def __getitem__(self, idx):
 class BERT(Disaggregator):
     """
     BERT-inspired transformer model for non-intrusive load monitoring.
+    
+    This implementation is based on the paper:
+    "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"
+    https://arxiv.org/abs/1810.04805
+    
+    The model adapts the BERT transformer architecture for energy disaggregation tasks,
+    using a sequence-to-sequence approach to predict individual appliance power consumption
+    from aggregate household power measurements.
+    
+    Architecture Overview:
+    - 1D Convolutional layer (16 filters, kernel size 4) for feature extraction
+    - LP pooling (pool size 2) for dimensionality reduction
+    - Token and position embedding layer to convert continuous values to embeddings
+    - Single transformer encoder block with multi-head self-attention
+    - Dense output layer for sequence prediction
+    
+    Parameters:
+        params (dict): Configuration parameters including:
+            - sequence_length (int): Length of input sequences (default: 99)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+            - chunk_wise_training (bool): Enable chunk-wise training (default: False)
+            - appliance_params (dict): Appliance-specific normalization parameters
     """
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
         self.MODEL_NAME = "BERT"
         self.chunk_wise_training = params.get('chunk_wise_training', False)
         self.sequence_length = params.get('sequence_length', 99)
@@ -117,35 +148,59 @@ def __init__(self, params):
         self.appliance_params = params.get('appliance_params', {})
         
         if self.sequence_length % 2 == 0:
-            print("Sequence length should be odd!")
+            _log_print("Sequence length should be odd!")
             raise SequenceLengthError
             
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         
     def return_network(self):
+        """Create the BERT-inspired module used by this backend.
+        
+        Layer stack (L = self.sequence_length):
+        - Conv1d(1 -> 16, kernel 4, 'same' padding, no activation): length stays L
+        - LPpool(pool_size=2): length becomes L // 2 (49 for the default L = 99)
+        - TokenAndPositionEmbedding: 16-dim features -> 32-dim embeddings
+        - Single TransformerBlock, then Flatten and Dropout
+        - Linear layer mapping (L // 2) * 32 values to sequence_length outputs
+        """
         embed_dim = 32
         num_heads = 2
         ff_dim = 32
         vocab_size = 20000
-        maxlen = self.sequence_length
+        maxlen = self.sequence_length // 2  # Pooled length (pool_size=2, stride=2); 99 -> 49 for the default
         
-        model = nn.Sequential(
-            Permute(0, 2, 1),  # [B, 1, 99]
-            nn.Conv1d(1, embed_dim, 4, stride=1, padding='same'),  # [B, embed_dim, 99]
-            LPpool(pool_size=2),  # [B, embed_dim, 49]
-            Permute(0, 2, 1),  # [B, 49, embed_dim]
-            PositionalEncoding(embed_dim, 49),  # [B, 49, embed_dim]
-            TransformerBlock(embed_dim, num_heads, ff_dim),  # [B, 49, embed_dim]
-            nn.Flatten(),  # [B, 49 * embed_dim]
-            nn.Dropout(0.1),
-            nn.Linear(49 * embed_dim, self.sequence_length),
-            nn.Dropout(0.1)
-        ).to(self.device)
+        class BERTModel(nn.Module):
+            def __init__(self, embed_dim, num_heads, ff_dim, vocab_size, maxlen, sequence_length, device):
+                super(BERTModel, self).__init__()
+                self.permute1 = Permute(0, 2, 1)
+                self.conv1d = nn.Conv1d(1, 16, 4, stride=1, padding='same')
+                self.lppool = LPpool(pool_size=2)
+                self.permute2 = Permute(0, 2, 1)
+                self.token_pos_emb = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
+                self.transformer = TransformerBlock(embed_dim, num_heads, ff_dim)
+                self.flatten = nn.Flatten()
+                self.dropout1 = nn.Dropout(0.1)
+                self.linear = nn.Linear(maxlen * embed_dim, sequence_length)  # maxlen is the pooled length
+                self.dropout2 = nn.Dropout(0.1)
+                
+            def forward(self, x):
+                x = self.permute1(x)  # [B, 1, L]
+                x = self.conv1d(x)    # [B, 16, L]
+                x = self.lppool(x)    # [B, 16, L // 2]
+                x = self.permute2(x)  # [B, L // 2, 16]
+                x = self.token_pos_emb(x)  # [B, L // 2, 32]
+                x = self.transformer(x)    # [B, L // 2, 32]
+                x = self.flatten(x)        # [B, (L // 2) * 32]
+                x = self.dropout1(x)
+                x = self.linear(x)         # [B, sequence_length]
+                x = self.dropout2(x)
+                return x
         
+        model = BERTModel(embed_dim, num_heads, ff_dim, vocab_size, maxlen, self.sequence_length, self.device).to(self.device)
         return model
     
     def partial_fit(self, train_main, train_appliances, do_preprocessing=True, **load_kwargs):
-        print("...............BERT partial_fit running...............")
+        _log_print("...............BERT partial_fit running...............")
         if len(self.appliance_params) == 0:
             self.set_appliance_params(train_appliances)
             
@@ -165,17 +220,21 @@ def partial_fit(self, train_main, train_appliances, do_preprocessing=True, **loa
         
         for appliance_name, power in train_appliances:
             if appliance_name not in self.models:
-                print("First model training for ", appliance_name)
+                _log_print("First model training for ", appliance_name)
                 self.models[appliance_name] = self.return_network()
             else:
-                print("Started Retraining model for ", appliance_name)
+                _log_print("Started Retraining model for ", appliance_name)
                 
             model = self.models[appliance_name]
-            optimizer = optim.Adam(model.parameters())
+            # Spell out Keras' Adam defaults to match TF's 'adam' (eps=1e-07 differs from PyTorch's default of 1e-08)
+            optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-07)
             criterion = nn.MSELoss()
             
             if train_main.size > 0:
                 if len(train_main) > 10:
+                    # Create unique filename for model weights like TF version
+                    filepath = checkpoint_path(".pt")
+                    
                     train_x, v_x, train_y, v_y = train_test_split(
                         train_main, power, test_size=.15, random_state=10)
                     
@@ -205,7 +264,7 @@ def partial_fit(self, train_main, train_appliances, do_preprocessing=True, **loa
                             train_loss += loss.item() * batch_mains.size(0)
                             train_loop.set_postfix(loss=loss.item())
                         
-                        train_loss /= len(train_loader.dataset)
+                        train_loss /= len(train_dataset)  # Use dataset length directly
                         
                         # Validation phase with tqdm
                         model.eval()
@@ -221,17 +280,20 @@ def partial_fit(self, train_main, train_appliances, do_preprocessing=True, **loa
                                 val_loss += loss.item() * batch_mains.size(0)
                                 val_loop.set_postfix(loss=loss.item())
                             
-                            val_loss /= len(val_loader.dataset)
+                            val_loss /= len(val_dataset)  # Use dataset length directly
                             
+                            # Save best model (like ModelCheckpoint in TF)
                             if val_loss < best_val_loss:
                                 best_val_loss = val_loss
-                                torch.save(model.state_dict(), f'BERT-temp-weights-{appliance_name}.pt')
-                        
-                        print(f'Epoch {epoch+1}/{self.n_epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
+                                torch.save(model.state_dict(), filepath)
+                                _log_print(f'Epoch {epoch+1}/{self.n_epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f} - Model saved')
+                            else:
+                                _log_print(f'Epoch {epoch+1}/{self.n_epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
                     
-                    model.load_state_dict(torch.load(f'BERT-temp-weights-{appliance_name}.pt'))
+                    # Load best weights (like TF version)
+                    model.load_state_dict(torch.load(filepath))
 
-    # [Rest of the methods remain exactly the same as in the previous version]
+    # Remaining methods keep the legacy backend behavior.
     def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
         if model is not None:
             self.models = model
@@ -262,15 +324,15 @@ def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
                 
                 prediction = np.concatenate(prediction, axis=0)
                 
-                l = self.sequence_length
-                n = len(prediction) + l - 1
+                window_length = self.sequence_length
+                n = len(prediction) + window_length - 1
                 sum_arr = np.zeros((n))
                 counts_arr = np.zeros((n))
-                o = len(sum_arr) 
+                len(sum_arr)
                 
                 for i in range(len(prediction)):
-                    sum_arr[i:i + l] += prediction[i].flatten()
-                    counts_arr[i:i + l] += 1
+                    sum_arr[i:i + window_length] += prediction[i].flatten()
+                    counts_arr[i:i + window_length] += 1
                 
                 for i in range(len(sum_arr)):
                     sum_arr[i] = sum_arr[i] / counts_arr[i]
@@ -304,7 +366,7 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
                     app_mean = self.appliance_params[app_name]['mean']
                     app_std = self.appliance_params[app_name]['std']
                 else:
-                    print("Parameters for ", app_name, " were not found!")
+                    _log_print("Parameters for ", app_name, " were not found!")
                     raise ApplianceNotFoundError()
                     
                 processed_app_dfs = []
@@ -324,6 +386,8 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
                 new_mains = mains.values.flatten()
                 n = self.sequence_length
                 units_to_pad = n // 2
+                # Test mains are deliberately left unpadded to mirror the TF version; the padding call stays commented out below
+                # new_mains = np.pad(new_mains, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
                 new_mains = np.array([new_mains[i:i + n] for i in range(len(new_mains) - n + 1)])
                 new_mains = (new_mains - self.mains_mean) / self.mains_std
                 new_mains = new_mains.reshape((-1, self.sequence_length))
@@ -332,9 +396,9 @@ def call_preprocessing(self, mains_lst, submeters_lst, method):
     
     def set_appliance_params(self, train_appliances):
         for (app_name, df_list) in train_appliances:
-            l = np.array(pd.concat(df_list, axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
+            values = np.array(pd.concat(df_list, axis=0))
+            app_mean = np.mean(values)
+            app_std = np.std(values)
             if app_std < 1:
                 app_std = 100
-            self.appliance_params.update({app_name: {'mean': app_mean, 'std': app_std}})
\ No newline at end of file
+            self.appliance_params.update({app_name: {'mean': app_mean, 'std': app_std}})
diff --git a/nilmtk_contrib/torch/conv_lstm.py b/nilmtk_contrib/torch/conv_lstm.py
new file mode 100644
index 0000000..67473cb
--- /dev/null
+++ b/nilmtk_contrib/torch/conv_lstm.py
@@ -0,0 +1,360 @@
+from collections import OrderedDict
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+from torch.utils.data import TensorDataset, DataLoader
+from nilmtk.disaggregate import Disaggregator
+
+
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
+class SequenceLengthError(Exception):
+    """Raised when the configured sequence_length is not odd."""
+
+class ApplianceNotFoundError(Exception):
+    """Raised when an appliance has no entry in appliance_params."""
+
+class ConvLSTM(Disaggregator):
+    """
+    Convolutional + bidirectional-LSTM model for non-intrusive load monitoring.
+    
+    The model name references the paper:
+    "Convolutional LSTM Network: A Machine Learning Approach for Precipitation Nowcasting"
+    https://arxiv.org/abs/1506.04214
+    
+    Unlike that paper, this network does not use ConvLSTM cells; it applies a stack of
+    1D convolutions to each mains window and feeds the resulting feature sequence to
+    bidirectional LSTM layers to predict one appliance power value per window.
+    
+    Architecture Overview:
+    - Three Conv1d layers (32, 64, 128 filters) with ReLU, followed by dropout
+    - Two bidirectional LSTM layers (hidden sizes 128 and 64), followed by dropout
+    - Dense layers (128 -> 64 -> 1) applied to the last timestep (sequence-to-point)
+    
+    Parameters:
+        params (dict): Configuration parameters including:
+            - sequence_length (int): Length of input sequences (default: 99)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+            - chunk_wise_training (bool): Enable chunk-wise training (default: False)
+            - appliance_params (dict): Appliance-specific normalization parameters
+            - mains_mean (float): Mean value for mains normalization (default: 1800)
+            - mains_std (float): Standard deviation for mains normalization (default: 600)
+    """
+    def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
+        super().__init__()
+        self.MODEL_NAME = "ConvLSTM"
+        self.models = OrderedDict()  # appliance name -> network, filled in by partial_fit
+        self.file_prefix = f"{self.MODEL_NAME.lower()}-temp-weights"
+        
+        # Read the hyperparameters from the params dict, falling back to the defaults given here.
+        self.chunk_wise_training = params.get("chunk_wise_training", False)
+        self.sequence_length = params.get("sequence_length", 99)
+        self.n_epochs = params.get("n_epochs", 10)
+        self.batch_size = params.get("batch_size", 512)
+        self.appliance_params = params.get("appliance_params", {})
+        self.mains_mean = params.get("mains_mean", 1800)
+        self.mains_std = params.get("mains_std", 600)
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when one is available
+        
+        # Windows are padded by sequence_length // 2 on each side, so the length must be odd
+        if self.sequence_length % 2 == 0:
+            _log_print("Sequence length should be odd!")
+            raise SequenceLengthError
+
+    def return_network(self):
+        """
+        Builds a new ConvLSTMNet (Conv1d stack followed by two BiLSTMs) on self.device.
+        """
+        class ConvLSTMNet(nn.Module):
+            def __init__(self, sequence_length):
+                super().__init__()
+                # NOTE: sequence_length is accepted but no layer below depends on it
+                # Convolutional feature extraction layers
+                # Each layer shortens its input by one step (kernel = 2 * padding + 2), so L becomes L - 3
+                self.conv1 = nn.Conv1d(1, 32, kernel_size=8, stride=1, padding=3)
+                self.conv2 = nn.Conv1d(32, 64, kernel_size=6, stride=1, padding=2)
+                self.conv3 = nn.Conv1d(64, 128, kernel_size=4, stride=1, padding=1)
+                
+                # Channels produced by conv3, i.e. the per-timestep feature size fed to lstm1
+                self.conv_output_dim = 128
+                
+                # Dropout for regularization
+                self.dropout1 = nn.Dropout(0.2)
+                
+                # BiLSTM layers for temporal modeling
+                self.lstm1 = nn.LSTM(
+                    input_size=self.conv_output_dim,
+                    hidden_size=128,
+                    num_layers=1,
+                    batch_first=True,
+                    bidirectional=True,
+                    dropout=0.0
+                )
+                
+                self.lstm2 = nn.LSTM(
+                    input_size=256,  # 128 * 2 (bidirectional)
+                    hidden_size=64,
+                    num_layers=1,
+                    batch_first=True,
+                    bidirectional=True,
+                    dropout=0.0
+                )
+                
+                self.dropout2 = nn.Dropout(0.2)
+                
+                # Final prediction layers
+                self.fc1 = nn.Linear(128, 64)  # 64 * 2 (bidirectional)
+                self.fc2 = nn.Linear(64, 1)
+                
+                # Initialize weights
+                self._initialize_weights()
+            
+            def _initialize_weights(self):
+                """
+                Xavier-uniform for Conv1d/Linear and LSTM input weights, orthogonal for LSTM recurrent weights, zero biases.
+                """
+                for m in self.modules():
+                    if isinstance(m, nn.Conv1d):
+                        nn.init.xavier_uniform_(m.weight)
+                        if m.bias is not None:
+                            nn.init.zeros_(m.bias)
+                    elif isinstance(m, nn.Linear):
+                        nn.init.xavier_uniform_(m.weight)
+                        if m.bias is not None:
+                            nn.init.zeros_(m.bias)
+                    elif isinstance(m, nn.LSTM):
+                        for name, param in m.named_parameters():
+                            if 'weight_ih' in name:
+                                nn.init.xavier_uniform_(param.data)
+                            elif 'weight_hh' in name:
+                                nn.init.orthogonal_(param.data)
+                            elif 'bias' in name:
+                                nn.init.zeros_(param.data)
+            
+            def forward(self, x):
+                # x shape: (batch_size, 1, sequence_length)
+                
+                # Convolutional feature extraction
+                x = torch.relu(self.conv1(x))
+                x = torch.relu(self.conv2(x))
+                x = torch.relu(self.conv3(x))
+                x = self.dropout1(x)
+                
+                # Reorder for the batch_first LSTMs: (batch_size, steps, features)
+                x = x.transpose(1, 2)  # (batch_size, sequence_length - 3, conv_output_dim)
+                
+                # BiLSTM layers
+                x, _ = self.lstm1(x)
+                x, _ = self.lstm2(x)
+                x = self.dropout2(x)
+                # NOTE(review): at the last timestep the backward half of the BiLSTM output has seen only one step - confirm intended
+                # Take the last timestep output for sequence-to-point prediction
+                x = x[:, -1, :]  # (batch_size, hidden_size * 2)
+                
+                # Final prediction layers
+                x = torch.relu(self.fc1(x))
+                x = self.fc2(x)
+                
+                return x
+        
+        model = ConvLSTMNet(self.sequence_length).to(self.device)
+        return model
+
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """
+        Preprocesses data by creating sliding windows, same as seq2point.
+
+        Args:
+            mains_lst: iterable of pandas mains frames.
+            submeters_lst: iterable of (appliance name, list of frames); only read when method == 'train'.
+            method: 'train' returns (windowed mains, normalized appliances); any other value returns only the windowed mains.
+
+        Raises:
+            ApplianceNotFoundError: if an appliance has no entry in self.appliance_params.
+        """
+        # Train and test mains share the same padding, windowing and normalization.
+        mains_df_list = self._window_mains(mains_lst)
+        if method != 'train':
+            return mains_df_list
+
+        appliance_list = []
+        for app_name, app_df_list in submeters_lst:
+            if app_name not in self.appliance_params:
+                _log_print("Parameters for", app_name, "were not found!")
+                raise ApplianceNotFoundError()
+            app_mean = self.appliance_params[app_name]['mean']
+            app_std = self.appliance_params[app_name]['std']
+            processed_appliance_dfs = []
+            for app_df in app_df_list:
+                # One normalized target value per row, returned as a single-column DataFrame
+                new_app_readings = (app_df.values.reshape((-1, 1)) - app_mean) / app_std
+                processed_appliance_dfs.append(pd.DataFrame(new_app_readings))
+            appliance_list.append((app_name, processed_appliance_dfs))
+        return mains_df_list, appliance_list
+
+    def _window_mains(self, mains_lst):
+        """
+        Zero-pads each mains frame by sequence_length // 2 per side, cuts it into
+        sliding windows of sequence_length samples and normalizes them.
+        """
+        n = self.sequence_length
+        units_to_pad = n // 2
+        mains_df_list = []
+        for mains in mains_lst:
+            new_mains = mains.values.flatten()
+            new_mains = np.pad(new_mains, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+            new_mains = np.array([new_mains[i:i + n] for i in range(len(new_mains) - n + 1)])
+            new_mains = (new_mains - self.mains_mean) / self.mains_std
+            mains_df_list.append(pd.DataFrame(new_mains))
+        return mains_df_list
+
+    def set_appliance_params(self, train_appliances):
+        """Stores a mean/std pair for every appliance in self.appliance_params."""
+        for name, frames in train_appliances:
+            readings = np.array(pd.concat(frames, axis=0))
+            spread = np.std(readings)
+            # A spread below 1 is replaced by the fixed value 100.
+            entry = {
+                'mean': np.mean(readings),
+                'std': 100 if spread < 1 else spread,
+            }
+            self.appliance_params.update({name: entry})
+        _log_print(self.appliance_params)
+
+    def partial_fit(self, train_main, train_appliances, do_preprocessing=True, current_epoch=0, **load_kwargs):
+        """
+        Trains one Conv-LSTM model per appliance on a chunk of data.
+
+        For each appliance, 15% of the windows are held out for validation and the weights
+        with the lowest validation loss are restored once all epochs have run.
+        current_epoch and load_kwargs are accepted but not used here.
+        """
+        # If no appliance wise parameters are provided, then compute them using the first chunk
+        if len(self.appliance_params) == 0:
+            self.set_appliance_params(train_appliances)
+
+        _log_print("...............ConvLSTM partial_fit running...............")
+        # Do the pre-processing, such as windowing and normalizing
+        if do_preprocessing:
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
+
+        train_main = pd.concat(train_main, axis=0)
+        train_main = train_main.values.reshape((-1, self.sequence_length, 1))
+        new_train_appliances = []
+        for app_name, app_df in train_appliances:
+            app_df = pd.concat(app_df, axis=0)
+            app_df_values = app_df.values.reshape((-1, 1))
+            new_train_appliances.append((app_name, app_df_values))
+        train_appliances = new_train_appliances
+
+        for appliance_name, power in train_appliances:
+            # Check if the appliance was already trained. If not then create a new model for it
+            if appliance_name not in self.models:
+                _log_print("First model training for", appliance_name)
+                self.models[appliance_name] = self.return_network()
+            # Retrain the particular appliance
+            else:
+                _log_print("Started Retraining model for", appliance_name)
+
+            model = self.models[appliance_name]
+            if train_main.size > 0:
+                # Sometimes chunks can be empty after dropping NANS
+                if len(train_main) > 10:
+                    # PyTorch Conv1d expects (batch, channels, length)
+                    train_main_tensor = torch.tensor(train_main, dtype=torch.float32).permute(0, 2, 1).to(self.device)
+                    power_tensor = torch.tensor(power, dtype=torch.float32).squeeze(-1).to(self.device)  # (N, 1) -> (N,)
+
+                    # Create validation split
+                    n_samples = train_main_tensor.size(0)
+                    val_size = max(1, int(0.15 * n_samples)) if n_samples > 1 else 0
+                    indices = torch.randperm(n_samples)
+                    train_idx, val_idx = indices[val_size:], indices[:val_size]
+
+                    # The datasets do not change between epochs, so build the loaders once
+                    train_dataset = TensorDataset(train_main_tensor[train_idx], power_tensor[train_idx])
+                    val_dataset = TensorDataset(train_main_tensor[val_idx], power_tensor[val_idx])
+                    train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
+                    val_loader = DataLoader(val_dataset, batch_size=self.batch_size)
+
+                    optimizer = torch.optim.Adam(model.parameters())
+                    criterion = nn.MSELoss()
+
+                    best_val_loss = float('inf')
+                    filepath = checkpoint_path(".pth")
+
+                    for epoch in range(self.n_epochs):
+                        model.train()
+                        epoch_losses = []
+                        for batch_X, batch_y in train_loader:
+                            optimizer.zero_grad()
+                            # squeeze(-1) keeps the batch axis even when a batch holds a single sample
+                            predictions = model(batch_X).squeeze(-1)
+                            loss = criterion(predictions, batch_y)
+                            loss.backward()
+                            # Add gradient clipping like seq2point_new
+                            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+                            optimizer.step()
+                            epoch_losses.append(loss.item())
+
+                        # Validate in batches so the held-out set never runs in a single forward pass
+                        model.eval()
+                        val_loss = 0.0
+                        with torch.no_grad():
+                            for batch_X, batch_y in val_loader:
+                                batch_loss = criterion(model(batch_X).squeeze(-1), batch_y)
+                                val_loss += batch_loss.item() * batch_X.size(0)
+                        val_loss /= len(val_dataset)
+
+                        avg_train_loss = np.mean(epoch_losses)
+                        _log_print(f"Epoch {epoch+1}/{self.n_epochs} - loss: {avg_train_loss:.4f} - val_loss: {val_loss:.4f}")
+
+                        # Save best model using the legacy checkpoint behavior.
+                        if val_loss < best_val_loss:
+                            best_val_loss = val_loss
+                            torch.save(model.state_dict(), filepath)
+                            _log_print(f"Validation loss improved, saving model to {filepath}")
+
+                    # Restore the best weights; skipped when no epoch wrote a checkpoint (n_epochs == 0 or NaN losses)
+                    if best_val_loss != float('inf'):
+                        model.load_state_dict(torch.load(filepath, map_location=self.device))
+
+    def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
+        """
+        Disaggregates a chunk of mains power data.
+
+        Returns one DataFrame per mains frame with a float32 column per trained
+        appliance; predictions are de-normalized and negative values clipped to 0.
+        Inference runs in batches of self.batch_size windows to bound memory use.
+        """
+        if model is not None:
+            self.models = model
+
+        # Preprocess the test mains such as windowing and normalizing
+        if do_preprocessing:
+            test_main_list = self.call_preprocessing(test_main_list, submeters_lst=None, method='test')
+
+        test_predictions = []
+        for test_main in test_main_list:
+            windows = test_main.values.reshape((-1, self.sequence_length, 1))
+            # Conv1d expects (batch, channels, length); batches are moved to the device one at a time
+            windows = torch.tensor(windows, dtype=torch.float32).permute(0, 2, 1)
+
+            disaggregation_dict = {}
+            for appliance, network in self.models.items():
+                network.eval()
+                with torch.no_grad():
+                    outputs = [network(batch.to(self.device)).cpu().numpy().reshape(-1)
+                               for batch in torch.split(windows, self.batch_size)]
+                prediction = np.concatenate(outputs) if outputs else np.zeros(0, dtype=np.float32)
+                # Denormalize with the Seq2Point-style appliance parameters.
+                params = self.appliance_params[appliance]
+                prediction = params['mean'] + prediction * params['std']
+                disaggregation_dict[appliance] = pd.Series(np.where(prediction > 0, prediction, 0))
+            test_predictions.append(pd.DataFrame(disaggregation_dict, dtype='float32'))
+        return test_predictions
diff --git a/nilmtk_contrib/torch/dae.py b/nilmtk_contrib/torch/dae.py
index 4fc6c67..add12b2 100644
--- a/nilmtk_contrib/torch/dae.py
+++ b/nilmtk_contrib/torch/dae.py
@@ -1,10 +1,31 @@
-import os, json
-import torch, torch.nn as nn, torch.optim as optim
-import numpy as np, pandas as pd
+import json
+from pathlib import Path
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import numpy as np
+import pandas as pd
 from tqdm import tqdm
 from collections import OrderedDict
 from torch.utils.data import TensorDataset, DataLoader
 from nilmtk.disaggregate import Disaggregator
+from nilmtk_contrib.utils.checkpoints import (
+    build_metadata,
+    collect_dependencies,
+    load_metadata,
+    load_torch_state,
+    save_metadata,
+    save_torch_state,
+    temporary_checkpoint,
+)
+from nilmtk_contrib.utils.logging import get_logger
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print
+from nilmtk_contrib.utils.params import normalize_common_params
+from nilmtk_contrib.utils.random import set_random_seed
+from nilmtk_contrib.utils.validation import train_validation_split
+
+logger = get_logger(__name__)
+_log_print = legacy_print(logger)
 
 class DAEModel(nn.Module):
     """
@@ -36,24 +57,76 @@ def forward(self, x):
         return x
 
 class DAE(Disaggregator):
+    """
+    Denoising Autoencoder for non-intrusive load monitoring.
+    
+    This implementation is based on the paper:
+    "Neural NILM: Deep Neural Networks Applied to Energy Disaggregation"
+    https://arxiv.org/abs/1507.06594
+    
+    The model uses a denoising autoencoder architecture for energy disaggregation tasks,
+    learning to reconstruct individual appliance power consumption from aggregate
+    household power measurements.
+    
+    Architecture Overview:
+    - Convolutional encoder layer for feature extraction
+    - Fully connected bottleneck layers for dimensionality reduction
+    - Convolutional decoder layer for sequence reconstruction
+    - Sequence-to-sequence prediction for energy disaggregation
+    
+    Parameters:
+        params (dict): Configuration parameters including:
+            - sequence_length (int): Length of input sequences (default: 99)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+            - mains_mean (float): Mean value for mains normalization (default: 1000)
+            - mains_std (float): Standard deviation for mains normalization (default: 600)
+            - appliance_params (dict): Appliance-specific normalization parameters
+            - save-model-path (str): Path to save trained models
+            - pretrained-model-path (str): Path to load pre-trained models
+    """
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))  # NOTE(review): runs before super().__init__(); helper is opaque here — confirm ordering is intended
         super().__init__()
+        common = normalize_common_params(  # presumably fills missing keys from `defaults`; verify in utils.params
+            params,
+            defaults={
+                "sequence_length": 99,
+                "n_epochs": 10,
+                "batch_size": 512,
+                "mains_mean": 1000,
+                "mains_std": 600,
+                "appliance_params": {},
+                "save_model_path": None,
+                "pretrained_model_path": None,
+                "chunk_wise_training": False,
+                "seed": None,
+                "verbose": False,
+                "device": None,
+            },
+        )
         self.MODEL_NAME        = "DAE"
         self.file_prefix       = f"{self.MODEL_NAME.lower()}-temp-weights"
-        self.sequence_length   = params.get('sequence_length', 99)
-        self.n_epochs          = params.get('n_epochs', 10)
-        self.batch_size        = params.get('batch_size', 512)
-        self.mains_mean        = params.get('mains_mean', 1000)
-        self.mains_std         = params.get('mains_std', 600)
-        self.appliance_params  = params.get('appliance_params', {})
-        self.save_model_path   = params.get('save-model-path', None)
-        self.load_model_path   = params.get('pretrained-model-path', None)
+        self.sequence_length   = common.sequence_length
+        self.n_epochs          = common.n_epochs
+        self.batch_size        = common.batch_size
+        self.mains_mean        = common.mains_mean
+        self.mains_std         = common.mains_std
+        self.appliance_params  = common.appliance_params
+        self.save_model_path   = common.save_model_path
+        self.load_model_path   = common.pretrained_model_path  # a truthy value triggers load_model() at the end of __init__
+        self.chunk_wise_training = common.chunk_wise_training
+        self.seed              = common.seed
+        self.verbose           = common.verbose  # partial_fit disables its tqdm bar when this is falsy
         self.models            = OrderedDict()
-        self.device            = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        device = common.device or ("cuda" if torch.cuda.is_available() else "cpu")  # explicit device wins; otherwise CUDA when available
+        self.device            = torch.device(device)
+        set_random_seed(self.seed, backends=("python", "numpy", "torch"))  # NOTE(review): presumably a no-op for seed=None — confirm in utils.random
         if self.load_model_path:
             self.load_model()
 
     def return_network(self):
+        """Create a new DAEModel for ``self.sequence_length`` and move it to ``self.device``."""
         return DAEModel(self.sequence_length).to(self.device)
 
     def set_appliance_params(self, train_appliances):
@@ -63,10 +136,14 @@ def set_appliance_params(self, train_appliances):
         for name, lst in train_appliances:
             arr = pd.concat(lst, axis=0).values.flatten()
             m, s = arr.mean(), arr.std()
-            if s < 1: s = 100  # avoid zero std
+            if s < 1:
+                s = 100  # avoid zero std
             self.appliance_params[name] = {'mean': m, 'std': s}
 
     def normalize_input(self, data, n, mean, std, overlap):
+        """
+        Normalizes and windows the input data.
+        """
         flat = data.flatten()
         pad  = (n - flat.size % n) % n
         flat = np.concatenate([flat, np.zeros(pad)])
@@ -79,11 +156,14 @@ def normalize_input(self, data, n, mean, std, overlap):
         return ((w - mean)/std).reshape(-1, n, 1)  # normalize and reshape for model
 
     def denormalize_output(self, data, mean, std):
+        """
+        Undo z-score scaling: returns ``mean + data * std``.
+        """
         return mean + data*std
 
     def call_preprocessing(self, mains_lst, subs, method):
         """
-        Preprocess the mains and appliances data for training or testing.
+        Preprocesses the mains and appliance data.
         """
         if method == 'train':
             pm, apps = [], []
@@ -119,6 +199,9 @@ def call_preprocessing(self, mains_lst, subs, method):
         return pm
 
     def partial_fit(self, train_main, train_appliances, do_preprocessing=True, current_epoch=0, **_):
+        """
+        Trains the model on a chunk of data.
+        """
         if not self.appliance_params:
             self.set_appliance_params(train_appliances)
 
@@ -140,72 +223,111 @@ def partial_fit(self, train_main, train_appliances, do_preprocessing=True, curre
 
             X = torch.tensor(mains_arr, dtype=torch.float32)  # mains input
             Y = torch.tensor(arr, dtype=torch.float32)  # appliance output
-            split = int(len(X)*0.85)
-            tr_ds = TensorDataset(X[:split], Y[:split])  # train set
-            va_ds = TensorDataset(X[split:], Y[split:])  # validation set
+            split = train_validation_split(
+                X,
+                Y,
+                validation_fraction=0.15,
+                strategy="tail",
+                min_train=1,
+                min_val=1,
+                allow_no_validation=True,
+            )
+            if not split.metadata.should_train:
+                continue
+
+            tr_ds = TensorDataset(split.X_train, split.y_train)  # train set
             tr = DataLoader(tr_ds, batch_size=self.batch_size, shuffle=True)  # train loader
-            va = DataLoader(va_ds, batch_size=self.batch_size)  # validation loader
+            va = None
+            if split.metadata.validation_enabled:
+                va_ds = TensorDataset(split.X_val, split.y_val)  # validation set
+                va = DataLoader(va_ds, batch_size=self.batch_size)  # validation loader
 
             opt     = optim.Adam(model.parameters())
             loss_fn = nn.MSELoss()
-            best    = float('inf')
-            ckpt    = f"{self.file_prefix}-{name.replace(' ','_')}-epoch{current_epoch}.pt"
-
-            for _ in tqdm(range(self.n_epochs), desc=name):
-                model.train()
-                for xb, yb in tr:
-                    xb, yb = xb.to(self.device), yb.to(self.device)
-                    opt.zero_grad()
-                    out = model(xb)
-                    loss_fn(out, yb).backward()
-                    opt.step()
-
-                model.eval()
-                vl = []
-                with torch.no_grad():
-                    for xb, yb in va:
+            best = float('inf')
+            with temporary_checkpoint(".pt") as ckpt:
+                epochs = tqdm(range(self.n_epochs), desc=name, disable=not self.verbose)
+                for _ in epochs:
+                    model.train()
+                    for xb, yb in tr:
                         xb, yb = xb.to(self.device), yb.to(self.device)
-                        vl.append(loss_fn(model(xb), yb).item())
-                val_loss = sum(vl)/len(vl)
-                if val_loss < best:
-                    best = val_loss
-                    torch.save(model.state_dict(), ckpt)
+                        opt.zero_grad()
+                        out = model(xb)
+                        loss_fn(out, yb).backward()
+                        opt.step()
+
+                    if va is None:
+                        save_torch_state(model, ckpt)
+                    else:
+                        model.eval()
+                        vl = []
+                        with torch.no_grad():
+                            for xb, yb in va:
+                                xb, yb = xb.to(self.device), yb.to(self.device)
+                                vl.append(loss_fn(model(xb), yb).item())
+                        if vl:
+                            val_loss = sum(vl)/len(vl)
+                            if val_loss < best:
+                                best = val_loss
+                                save_torch_state(model, ckpt)
 
-            model.load_state_dict(torch.load(ckpt, map_location=self.device))
+                if ckpt.exists():
+                    load_torch_state(model, ckpt, self.device)
 
         if self.save_model_path:
             self.save_model()
 
     def save_model(self):
-        os.makedirs(self.save_model_path, exist_ok=True)
-        params = {
-            'sequence_length': self.sequence_length,
-            'mains_mean':      self.mains_mean,
-            'mains_std':       self.mains_std,
-            'appliance_params':self.appliance_params
-        }
-        with open(os.path.join(self.save_model_path,'model.json'),'w') as f:
-            json.dump(params, f)
+        """
+        Create ``save_model_path`` if needed, write the metadata record, then one "<name>.pt" state file per model.
+        """
+        model_folder = Path(self.save_model_path)
+        model_folder.mkdir(parents=True, exist_ok=True)
+        metadata = build_metadata(
+            model_class=self.MODEL_NAME,
+            backend="torch",
+            sequence_length=self.sequence_length,
+            appliance_params=self.appliance_params,
+            mains_mean=self.mains_mean,
+            mains_std=self.mains_std,
+            dependencies=collect_dependencies(["nilmtk-contrib", "torch", "numpy", "pandas"]),
+        )
+        save_metadata(model_folder, metadata)  # NOTE(review): load_model looks for "metadata.json" — presumably the file written here
         for name, m in self.models.items():
-            torch.save(m.state_dict(),
-                       os.path.join(self.save_model_path, f"{name}.pt"))
+            logger.info("Saving %s model for %s.", self.MODEL_NAME, name)
+            save_torch_state(m, model_folder / f"{name}.pt")
 
     def load_model(self):
-        with open(os.path.join(self.load_model_path,'model.json')) as f:
-            p = json.load(f)
+        """Restore normalization settings and per-appliance weights from ``load_model_path``.
+        Reads metadata.json when it exists, otherwise the legacy model.json; then loads "<name>.pt" per appliance.
+        """
+        model_folder = Path(self.load_model_path)
+        metadata_path = model_folder / "metadata.json"
+        if metadata_path.exists():
+            p = load_metadata(  # NOTE(review): assumed to return a mapping with the same keys as legacy model.json — confirm
+                model_folder,
+                expected_model_class=self.MODEL_NAME,
+                expected_backend="torch",
+            )
+        else:
+            logger.warning(
+                "Loading legacy %s model metadata from model.json.", self.MODEL_NAME
+            )
+            with open(model_folder / 'model.json') as f:
+                p = json.load(f)
         self.sequence_length = p['sequence_length']
         self.mains_mean      = p['mains_mean']
         self.mains_std       = p['mains_std']
         self.appliance_params= p['appliance_params']
         for name in self.appliance_params:
             m = self.return_network()
-            m.load_state_dict(torch.load(
-                os.path.join(self.load_model_path, f"{name}.pt"),
-                map_location=self.device
-            ))
+            load_torch_state(m, model_folder / f"{name}.pt", self.device)
             self.models[name] = m
 
     def disaggregate_chunk(self, test_main_list, do_preprocessing=True):
+        """
+        Disaggregates a chunk of mains data.
+        """
         if do_preprocessing:
             test_main_list = self.call_preprocessing(
                 test_main_list, None, 'test'
@@ -232,4 +354,4 @@ def disaggregate_chunk(self, test_main_list, do_preprocessing=True):
                 p_den = np.clip(p_den, 0, None)
                 outd[name] = pd.Series(p_den)
             results.append(pd.DataFrame(outd, dtype='float32'))
-        return results
\ No newline at end of file
+        return results
diff --git a/nilmtk_contrib/torch/msdc.py b/nilmtk_contrib/torch/msdc.py
new file mode 100644
index 0000000..1d5ce1e
--- /dev/null
+++ b/nilmtk_contrib/torch/msdc.py
@@ -0,0 +1,692 @@
+from collections import OrderedDict
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+from nilmtk.disaggregate import Disaggregator
+
+
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
+class SequenceLengthError(Exception):
+    """Raised by MSDC when the configured sequence_length is even."""
+
+
+class ApplianceNotFoundError(Exception):
+    """Signals that a requested appliance is unavailable (NOTE(review): confirm the raise sites)."""
+
+
+class MSDCNet(nn.Module):
+    """
+    Dual-branch CNN for joint state classification and power prediction.
+    - Branch 1: Predicts state emission scores for a CRF.
+    - Branch 2: Predicts power consumption for each state.
+    - The CRF is only stored here (``self.crf``); ``forward`` does not call it.
+    """
+
+    def __init__(self, window_length, num_states):
+        """window_length is stored for reference only; num_states sizes both heads and the CRF."""
+        super().__init__()
+        self.window_length = window_length
+        self.num_states = num_states
+        # Shared CNN feature extractor (global average pooling leaves 64 features per window)
+        self.shared_cnn = nn.Sequential(
+            nn.Conv1d(1, 32, kernel_size=3, padding=1),
+            nn.ReLU(),
+            nn.Conv1d(32, 64, kernel_size=3, padding=1),
+            nn.ReLU(),
+            nn.AdaptiveAvgPool1d(1)
+        )
+
+        # Branch 1: State emission scores for CRF
+        self.state_branch = nn.Sequential(
+            nn.Linear(64, 128),
+            nn.ReLU(),
+            nn.Dropout(0.5),
+            nn.Linear(128, num_states)
+        )
+
+        # Branch 2: Power predictions for each state
+        self.power_branch = nn.Sequential(
+            nn.Linear(64, 128),
+            nn.ReLU(),
+            nn.Dropout(0.5),
+            nn.Linear(128, num_states)
+        )
+
+        # CRF layer for state sequence modeling
+        self.crf = CRF(num_states)
+
+    def forward(self, x):
+        """
+        Forward pass through the network.
+        Args:
+            x: Input tensor of shape (batch_size, seq_len, window_length)
+
+        Returns:
+            emissions: State emission scores (batch_size, seq_len, num_states)
+            power_preds: Power predictions for each state (batch_size, seq_len, num_states)
+        """
+        batch_size, seq_len, window_length = x.shape
+
+        # Reshape for CNN: (batch_size * seq_len, 1, window_length).
+        # reshape (not view) so permuted or sliced, non-contiguous inputs also work.
+        x_reshaped = x.reshape(-1, 1, window_length)
+        # Extract features using shared CNN
+        features = self.shared_cnn(x_reshaped)  # (batch_size * seq_len, 64, 1)
+        features = features.squeeze(-1)  # (batch_size * seq_len, 64)
+
+        # Branch 1: State emissions
+        emissions = self.state_branch(features)  # (batch_size * seq_len, num_states)
+        emissions = emissions.view(batch_size, seq_len, self.num_states)
+
+        # Branch 2: Power predictions
+        power_preds = self.power_branch(features)  # (batch_size * seq_len, num_states)
+        power_preds = power_preds.view(batch_size, seq_len, self.num_states)
+
+        return emissions, power_preds
+
+
+class CRF(nn.Module):
+    """Linear-chain CRF holding learned start, transition (from-state, to-state) and end scores."""
+    
+    def __init__(self, num_states):
+        super(CRF, self).__init__()
+        self.num_states = num_states
+        
+        # Transition parameters; transitions[i, j] scores a move from state i to state j
+        self.transitions = nn.Parameter(torch.randn(num_states, num_states))
+        self.start_transitions = nn.Parameter(torch.randn(num_states))
+        self.end_transitions = nn.Parameter(torch.randn(num_states))
+    
+    def forward(self, emissions):
+        """Log partition (log-sum-exp over all state paths) per batch item; returns shape (batch_size,)."""
+        batch_size, seq_len, num_states = emissions.shape
+        
+        # Initialize with start transitions
+        alpha = emissions[:, 0] + self.start_transitions.unsqueeze(0)
+        
+        # Forward recursion: log-sum-exp over the previous state (dim=1) at every step
+        for t in range(1, seq_len):
+            alpha_expanded = alpha.unsqueeze(2)  # (batch_size, num_states, 1)
+            trans_scores = alpha_expanded + self.transitions.unsqueeze(0)  # (batch_size, num_states, num_states)
+            alpha = torch.logsumexp(trans_scores, dim=1) + emissions[:, t]
+        
+        # Add end transitions
+        log_partition = torch.logsumexp(alpha + self.end_transitions.unsqueeze(0), dim=1)
+        return log_partition
+    
+    def score_sequence(self, emissions, states):
+        """Unnormalized score of the given state path (start + emissions + transitions + end), shape (batch_size,)."""
+        batch_size, seq_len = states.shape
+        
+        # Start transition score (advanced indexing yields a new tensor, so the += below leaves the parameter untouched)
+        score = self.start_transitions[states[:, 0]]
+        
+        # Emission scores
+        for t in range(seq_len):
+            score += emissions[range(batch_size), t, states[:, t]]
+        
+        # Transition scores
+        for t in range(seq_len - 1):
+            score += self.transitions[states[:, t], states[:, t + 1]]
+        
+        # End transition score
+        score += self.end_transitions[states[:, -1]]
+        
+        return score
+    
+    def viterbi_decode(self, emissions):
+        """Highest-scoring state path per batch item via Viterbi; returns a long tensor (batch_size, seq_len)."""
+        batch_size, seq_len, num_states = emissions.shape
+        
+        # Initialize; psi[:, t] stores the best previous state for each state at step t
+        delta = emissions[:, 0] + self.start_transitions.unsqueeze(0)
+        psi = torch.zeros(batch_size, seq_len, num_states, dtype=torch.long, device=emissions.device)
+        
+        # Forward pass: max (instead of log-sum-exp) over the previous state
+        for t in range(1, seq_len):
+            delta_expanded = delta.unsqueeze(2)  # (batch_size, num_states, 1)
+            trans_scores = delta_expanded + self.transitions.unsqueeze(0)  # (batch_size, num_states, num_states)
+            
+            delta_next, psi[:, t] = torch.max(trans_scores, dim=1)
+            delta = delta_next + emissions[:, t]
+        
+        # Add end transitions and find best final state
+        final_scores = delta + self.end_transitions.unsqueeze(0)
+        best_final_states = torch.argmax(final_scores, dim=1)
+        
+        # Backward pass to reconstruct path
+        best_paths = torch.zeros(batch_size, seq_len, dtype=torch.long, device=emissions.device)
+        best_paths[:, -1] = best_final_states
+        
+        for t in range(seq_len - 2, -1, -1):
+            best_paths[:, t] = psi[range(batch_size), t + 1, best_paths[:, t + 1]]
+        
+        return best_paths
+
+
+class MSDC(Disaggregator):
+    """
+    Multi-State Dual CNN for non-intrusive load monitoring.
+    
+    This implementation is based on the paper:
+    "MSDC: Exploiting Multi-State Power Consumption in Non-intrusive Load Monitoring based on A Dual-CNN Model"
+    https://arxiv.org/abs/2302.05565
+    
+    The model uses a dual-branch CNN architecture with a CRF layer for joint state 
+    classification and power prediction in energy disaggregation tasks.
+    
+    Architecture Overview:
+    - Dual-branch CNN for feature extraction
+    - Branch 1: State emission scores for CRF layer
+    - Branch 2: Power consumption prediction for each state
+    - CRF layer for modeling state transitions
+    - Multi-state power consumption modeling
+    
+    Parameters:
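+        params (dict): Configuration parameters including:
+            - dataset (str, default 'uk_dale') and house (default None): select the state/normalization tables
+            - sequence_length (int, odd, default 99): Length of input sequences; num_states (int, default 3)
+            - n_epochs (50), batch_size (256), learning_rate (0.001), patience (5): Training settings
+            - mains_mean, mains_std, appliance_params (dict): Normalization parameters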
+    """
+    
+    # Dataset-specific configurations from the official MSDC implementation
+    APPLIANCE_STATES = {
+        'kettle': {
+            'uk_dale': {
+                'states': [2000, 4500],
+                'state_averages': [1.15, 2280.79],
+                'num_states': 2,
+                'threshold': 2000
+            }
+            # No REDD config for kettle in original - will fallback to UK-DALE
+        },
+        'microwave': {
+            'uk_dale': {
+                'states': [300, 3000],
+                'state_averages': [1.4, 1551.3],
+                'num_states': 2,
+                'threshold': 300
+            },
+            'redd': {
+                'states': [300, 3000],
+                'state_averages': [4.2, 1557.501],
+                'num_states': 2,
+                'threshold': 300
+            }
+        },
+        'fridge': {
+            'uk_dale': {
+                'states': [20, 200, 2500],
+                'state_averages': [0.13, 87.26, 246.5],
+                'num_states': 3,
+                'threshold': 20
+            },
+            'redd': {
+                'states': [50, 300, 500],
+                'state_averages': [3.2, 143.3, 397.3],
+                'num_states': 3,
+                'threshold': 50
+            },
+            'redd_house1': {
+                'states': [50, 300, 500],
+                'state_averages': [6.49, 192.57, 443],
+                'num_states': 3,
+                'threshold': 50
+            },
+            'redd_house2': {
+                'states': [50, 300, 500],
+                'state_averages': [6.34, 162.87, 418.36],
+                'num_states': 3,
+                'threshold': 50
+            },
+            'redd_house3': {
+                'states': [50, 300, 500],
+                'state_averages': [0.54, 118.85, 409.75],
+                'num_states': 3,
+                'threshold': 50
+            }
+        },
+        'dishwasher': {
+            'uk_dale': {
+                'states': [50, 1000, 4500],
+                'state_averages': [0.89, 122.56, 2324.9],
+                'num_states': 3,
+                'threshold': 50
+            },
+            'redd': {
+                'states': [150, 300, 1000, 3000],
+                'state_averages': [0.57, 232.91, 733.89, 1198.31],
+                'num_states': 4,
+                'threshold': 150
+            },
+            'redd_house1': {
+                'states': [150, 300, 1000, 3000],
+                'state_averages': [0.21, 216.75, 438.51, 1105.08],
+                'num_states': 4,
+                'threshold': 150
+            },
+            'redd_house2': {
+                'states': [150, 1000, 3000],
+                'state_averages': [0.16, 250.26, 1197.93],
+                'num_states': 3,
+                'threshold': 150
+            },
+            'redd_house3': {
+                'states': [50, 400, 1000],
+                'state_averages': [0.97, 195.6, 743.42],
+                'num_states': 3,
+                'threshold': 50
+            }
+        },
+        'washingmachine': {
+            'uk_dale': {
+                'states': [50, 800, 3500],
+                'state_averages': [0.13, 204.64, 1892.85],
+                'num_states': 3,
+                'threshold': 50
+            },
+            'uk_dale_house2': {
+                'states': [50, 200, 1000, 4000],
+                'state_averages': [2.83, 114.34, 330.25, 2100.14],
+                'num_states': 4,
+                'threshold': 50
+            },
+            'redd': {
+                'states': [500, 5000],
+                'state_averages': [0, 2627.3],
+                'num_states': 2,
+                'threshold': 500
+            }
+        }
+    }
+    
+    # Dataset-specific normalization parameters
+    DATASET_NORMALIZATION = {
+        'uk_dale': {
+            'mains_mean': 1800,
+            'mains_std': 600
+        },
+        'redd': {
+            'mains_mean': 352.32,  # From official MSDC REDD implementation
+            'mains_std': 608.42
+        }
+    }
+    
+    def __init__(self, params):
+        """Read settings from ``params``; raises SequenceLengthError when sequence_length is even."""
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
+        super().__init__()
+        self.MODEL_NAME = "MSDC"
+        self.file_prefix = f"{self.MODEL_NAME.lower()}-temp-weights"
+        
+        # Dataset configuration
+        self.dataset = params.get('dataset', 'uk_dale').lower()
+        self.house = params.get('house', None)
+        
+        # Unknown datasets are not an error: warn and fall back to 'uk_dale'
+        if self.dataset not in ['uk_dale', 'redd']:
+            _log_print(f"Warning: Unknown dataset '{self.dataset}'. Defaulting to 'uk_dale'.")
+            self.dataset = 'uk_dale'
+        
+        self.dataset_key = f"{self.dataset}_house{self.house}" if self.house else self.dataset
+        
+        # Hyperparameters
+        self.sequence_length = params.get('sequence_length', 99)
+        if self.sequence_length % 2 == 0:
+            raise SequenceLengthError("Sequence length must be odd")
+            
+        self.num_states = params.get('num_states', 3)  # Used only for appliances without an APPLIANCE_STATES entry
+        self.n_epochs = params.get('n_epochs', 50)
+        self.batch_size = params.get('batch_size', 256)
+        self.learning_rate = params.get('learning_rate', 0.001)
+        self.patience = params.get('patience', 5)
+        
+        # Dataset-specific normalization parameters (explicit params entries take precedence)
+        dataset_norm = self.DATASET_NORMALIZATION.get(self.dataset, self.DATASET_NORMALIZATION['uk_dale'])
+        self.mains_mean = params.get('mains_mean', dataset_norm['mains_mean'])
+        self.mains_std = params.get('mains_std', dataset_norm['mains_std'])
+        self.appliance_params = params.get('appliance_params', {})
+        
+        # Model and device configuration (CUDA when available, otherwise CPU)
+        self.models = OrderedDict()
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        
+        # Display configuration
+        _log_print(f"MSDC initialized for dataset: {self.dataset.upper()}")
+        if self.house:
+            _log_print(f"House: {self.house}")
+        _log_print(f"Configuration key: {self.dataset_key}")
+        _log_print(f"Mains normalization - mean: {self.mains_mean}, std: {self.mains_std}")
+    
+    def _get_appliance_config(self, appliance_name):
+        """Return the state-config dict for an appliance, or None if it has no entry.
+
+        Lookup order: house-specific key, dataset key, then the first config listed
+        (with a warning). Names are tried verbatim first and then lower-cased with
+        spaces, underscores and hyphens removed, so that a label such as
+        'washing machine' resolves to the 'washingmachine' entry.
+        """
+        key = appliance_name
+        if key not in self.APPLIANCE_STATES:
+            key = "".join(ch for ch in str(appliance_name).lower() if ch not in " _-")
+        appliance_configs = self.APPLIANCE_STATES.get(key)
+        if not appliance_configs:
+            return None
+        for candidate in (self.dataset_key, self.dataset):
+            if candidate in appliance_configs:
+                return appliance_configs[candidate]
+        # Neither key is present: fall back to the first config listed.
+        fallback_key = next(iter(appliance_configs))
+        _log_print(f"Warning: No {self.dataset_key} config for {appliance_name}, using {fallback_key}")
+        return appliance_configs[fallback_key]
+    
+    def return_network(self, appliance_name):
+        """Build an MSDCNet for one appliance and place it on ``self.device``."""
+        config = self._get_appliance_config(appliance_name)
+        if not config:
+            # No table entry: keep the instance-level default state count.
+            num_states = self.num_states
+            _log_print(f"Warning: No config found for {appliance_name}, using default {num_states} states")
+        else:
+            num_states = config['num_states']
+            _log_print(f"Creating network for {appliance_name} with {num_states} states ({self.dataset_key})")
+        return MSDCNet(self.sequence_length, num_states).to(self.device)
+    
+    def set_appliance_params(self, train_appliances):
+        """Compute mean/std per appliance and store them in ``self.appliance_params``."""
+        for app_name, frames in train_appliances:
+            values = pd.concat(frames, axis=0).values.flatten()
+            mean = values.mean()
+            spread = values.std()
+            # A std below 1 is replaced by 100 to avoid dividing by (almost) zero.
+            std = 100 if spread < 1 else spread
+            _log_print(f"Computed normalization for {app_name}: mean={mean:.2f}, std={std:.2f}")
+
+            self.appliance_params[app_name] = {'mean': mean, 'std': std}
+    
+    def _create_state_labels(self, power_sequence, appliance_name):
+        """
+        Map every power sample to an integer state index.
+
+        Uses the 'states' thresholds of the appliance's config when one exists;
+        otherwise derives thresholds from the appliance's mean power.
+        Returns a flat int64 array with values in [0, num_states - 1].
+        """
+        flat_power = power_sequence.flatten()
+        cfg = self._get_appliance_config(appliance_name)
+
+        if not cfg:
+            # No table entry: scale thresholds from the stored (or observed) mean.
+            n_states = self.num_states
+            stored = self.appliance_params.get(appliance_name, {})
+            ref = stored.get('mean', flat_power.mean())
+            if n_states == 2:
+                cut_points = [0.1 * ref]
+            elif n_states == 3:
+                cut_points = [0.1 * ref, 0.7 * ref]
+            else:
+                cut_points = np.linspace(0, ref * 1.2, n_states)[1:]
+        else:
+            cut_points = cfg['states']
+            n_states = cfg['num_states']
+
+        # Later thresholds overwrite earlier ones, so a sample ends up with the
+        # index of the last threshold it reaches.
+        labels = np.zeros_like(flat_power, dtype=np.int64)
+        for level, cut in enumerate(cut_points, start=1):
+            labels[flat_power >= cut] = level
+
+        # Indices past the top state (sample >= last table threshold) are clamped.
+        return np.clip(labels, 0, n_states - 1).astype(np.int64)
+    
+    def _compute_msdc_loss(self, model, x, y_power, y_states, appliance_name):
+        """
+        Computes the combined MSDC loss.
+        - CRF negative log-likelihood for state sequence.
+        - MSE for per-state power predictions.
+        - MSE for final power prediction based on Viterbi-decoded states.
+
+        Returns (total_loss, crf_loss, state_power_loss, final_power_loss).
+        Assumes y_power and y_states are (batch, seq_len) tensors and x is whatever
+        the model accepts — TODO confirm against the caller.
+        """
+        emissions, power_preds = model(x)
+        crf = model.crf
+
+        # Number of states: table entry if present, else the instance default.
+        config = self._get_appliance_config(appliance_name)
+        num_states = config['num_states'] if config else self.num_states
+
+        # 1. CRF loss (negative log-likelihood): log partition minus labelled path score
+        log_partition = crf(emissions)
+        sequence_scores = crf.score_sequence(emissions, y_states)
+        crf_loss = torch.mean(log_partition - sequence_scores)
+
+        # 2. Per-state power loss: squared error on the samples labelled with each state
+        state_power_loss = 0
+        for state_id in range(num_states):
+            state_mask = (y_states == state_id).float()
+            if state_mask.sum() > 0:
+                state_power_pred = power_preds[:, :, state_id]
+                masked_pred = state_power_pred * state_mask
+                masked_target = y_power * state_mask
+                state_power_loss += F.mse_loss(masked_pred, masked_target, reduction='sum') / (state_mask.sum() + 1e-8)
+
+        # 3. Final power loss (using Viterbi-decoded states)
+        # Decoding only yields integer indices, so no autograd graph is needed for it.
+        with torch.no_grad():
+            best_states = crf.viterbi_decode(emissions)
+        # One gather along the state axis replaces a Python loop over every
+        # (batch, time) element; gradients still flow into power_preds.
+        selected = power_preds.gather(2, best_states.unsqueeze(-1)).squeeze(-1)
+        final_power_pred = selected.to(y_power.dtype).reshape(y_power.shape)
+        final_power_loss = F.mse_loss(final_power_pred, y_power)
+
+        # Combined loss with weights from the paper
+        total_loss = crf_loss + 0.5 * state_power_loss + final_power_loss
+
+        return total_loss, crf_loss, state_power_loss, final_power_loss
+
+    def partial_fit(self, train_main, train_appliances,
+                    do_preprocessing=True, current_epoch=0, **_):
+        """
+        Train one MSDC model per appliance on a chunk of data.
+
+        For every appliance a model is created on first use (and reused on later
+        chunks), state labels are derived from the appliance power, and the
+        model is optimised with Adam for ``self.n_epochs`` epochs.
+
+        Args:
+            train_main: list of mains DataFrames.
+            train_appliances: list of ``(appliance_name, [DataFrame, ...])``.
+            do_preprocessing: window and normalise the inputs first when True.
+            current_epoch: accepted for API compatibility; not used here.
+        """
+        _log_print("started Partial Fit")
+
+        # Set appliance parameters if not already done
+        if len(self.appliance_params) == 0:
+            self.set_appliance_params(train_appliances)
+
+        # Preprocess data
+        if do_preprocessing:
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
+
+        _log_print("Preprocessing done")
+
+        # One mains window per row, with a trailing feature axis for the network.
+        mains_arr = pd.concat(train_main, axis=0).values
+        mains_arr = mains_arr.reshape(-1, self.sequence_length, 1)
+        mains_tensor = torch.FloatTensor(mains_arr).to(self.device)
+
+        # Train a separate model for each appliance
+        for appliance_name, app_dfs in train_appliances:
+            # Stack this appliance's windows: one row per window.
+            app_data = pd.concat(app_dfs, axis=0).values
+            _log_print(f"\nTraining MSDC for {appliance_name}...")
+
+            # Initialize model if not already trained
+            if appliance_name not in self.models:
+                self.models[appliance_name] = self.return_network(appliance_name)
+
+            model = self.models[appliance_name]
+            optimizer = optim.Adam(model.parameters(), lr=self.learning_rate)
+            # Convert data to tensors
+            app_tensor = torch.FloatTensor(app_data).to(self.device)
+
+            # app_data holds normalised power (see call_preprocessing), so undo
+            # the normalisation before thresholding. NOTE(review): assumes the
+            # thresholds in _create_state_labels are in un-normalised power units.
+            norm = self.appliance_params.get(appliance_name, {'mean': 0.0, 'std': 1.0})
+            raw_power = app_data * norm['std'] + norm['mean']
+            state_labels = np.array([
+                self._create_state_labels(row, appliance_name) for row in raw_power])
+            state_tensor = torch.LongTensor(state_labels).to(self.device)
+
+            # Create dataset and dataloader
+            dataset = TensorDataset(mains_tensor, app_tensor, state_tensor)
+            dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
+
+            # Training loop
+            model.train()
+            _log_print(f"Training on {self.device}...")
+            for epoch in range(self.n_epochs):
+                _log_print(f"Epoch {epoch + 1}/{self.n_epochs} for {appliance_name}")
+                total_loss = 0
+                batch_count = 0
+                for batch_mains, batch_app, batch_states in dataloader:
+                    optimizer.zero_grad()
+
+                    # _compute_msdc_loss runs the forward pass itself, so the
+                    # model is evaluated once per batch instead of twice.
+                    loss, _crf, _state, _final = self._compute_msdc_loss(
+                        model, batch_mains, batch_app.squeeze(-1), batch_states, appliance_name
+                    )
+
+                    # Backward pass and optimization
+                    loss.backward()
+                    optimizer.step()
+
+                    total_loss += loss.item()
+                    batch_count += 1
+
+                if epoch % 10 == 0:
+                    # max() guards the average when the loader yielded no batch.
+                    avg_loss = total_loss / max(batch_count, 1)
+                    _log_print(f"Epoch {epoch}/{self.n_epochs}, Avg Loss: {avg_loss:.4f}")
+
+            _log_print(f"Training completed for {appliance_name}!")
+    
+    def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
+        """
+        Disaggregate a chunk of mains data using the trained models.
+
+        Args:
+            test_main_list: list of mains DataFrames.
+            model: optional mapping ``appliance -> network`` replacing ``self.models``.
+            do_preprocessing: window and normalise the mains first when True.
+        Returns:
+            One float32 DataFrame per mains chunk, one column per appliance,
+            holding the de-normalised, non-negative power at each window centre.
+        """
+        if model is not None:
+            self.models = model
+
+        if do_preprocessing:
+            test_main_list = self.call_preprocessing(test_main_list, submeters_lst=None, method='test')
+
+        # Only the middle sample of every window is reported.
+        center_idx = self.sequence_length // 2
+        test_predictions = []
+        for test_main in test_main_list:
+            test_main = test_main.values.reshape((-1, self.sequence_length, 1))
+            test_main_tensor = torch.FloatTensor(test_main).to(self.device)
+            disggregation_dict = {}
+
+            # ``net`` (not ``model``) so the loop does not shadow the argument.
+            for appliance, net in self.models.items():
+                _log_print(f"Predicting {appliance}...")
+                net.eval()
+                # Mini-batches keep memory bounded on long recordings.
+                center_power = []
+                with torch.no_grad():
+                    for batch in torch.split(test_main_tensor, self.batch_size):
+                        emissions, power_preds = net(batch)
+                        # Viterbi state at the window centre, then one gather
+                        # picks that state's power (no per-element Python loop).
+                        best_states = torch.as_tensor(
+                            net.crf.viterbi_decode(emissions), dtype=torch.long, device=power_preds.device)
+                        center_states = best_states[:, center_idx].unsqueeze(1)
+                        center_power.append(power_preds[:, center_idx, :].gather(1, center_states).squeeze(1))
+                pred = torch.cat(center_power).cpu().numpy()
+
+                # Denormalize predictions
+                params = self.appliance_params[appliance]
+                pred = pred * params['std'] + params['mean']
+                disggregation_dict[appliance] = np.where(pred > 0, pred, 0)  # Ensure non-negative power
+
+            test_predictions.append(pd.DataFrame(disggregation_dict, dtype='float32'))
+
+        return test_predictions
+    
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """
+        Preprocessing method required by NILMTK API.
+
+        Every series is zero-padded by ``sequence_length // 2`` on both sides,
+        cut into stride-1 windows of ``sequence_length`` samples and normalised.
+
+        Args:
+            mains_lst: list of mains DataFrames.
+            submeters_lst: list of ``(appliance_name, [DataFrame, ...])``; only
+                read when ``method == 'train'``.
+            method: ``'train'`` returns ``(mains, appliances)``; any other value
+                returns the processed mains only.
+
+        Raises:
+            ApplianceNotFoundError: an appliance has no entry in ``appliance_params``.
+        """
+        # Hoisted out of the loops so the appliance branch never depends on the
+        # mains loop having run (it used to fail when ``mains_lst`` was empty).
+        n = self.sequence_length
+        units_to_pad = n // 2
+
+        def windows(df, mean, std):
+            # Flatten, pad, slice into overlapping windows, then normalise.
+            flat = np.pad(df.values.flatten(), (units_to_pad, units_to_pad),
+                          'constant', constant_values=(0, 0))
+            stacked = np.array([flat[i:i + n] for i in range(len(flat) - n + 1)])
+            # reshape keeps an (N, n) layout even when no window fits.
+            return pd.DataFrame(((stacked - mean) / std).reshape(-1, n))
+
+        # Mains are processed the same way in both modes.
+        processed_mains_lst = [
+            windows(mains, self.mains_mean, self.mains_std) for mains in mains_lst]
+        # Any non-'train' method only needs the mains.
+        if method != 'train':
+            return processed_mains_lst
+
+        # Appliance targets are windowed exactly like the mains.
+        appliance_list = []
+        for app_name, app_df_lst in submeters_lst:
+            if app_name not in self.appliance_params:
+                raise ApplianceNotFoundError(
+                    f"No normalization parameters for appliance '{app_name}'")
+            app_mean = self.appliance_params[app_name]['mean']
+            app_std = self.appliance_params[app_name]['std']
+            appliance_list.append(
+                (app_name, [windows(app_df, app_mean, app_std) for app_df in app_df_lst]))
+
+        return processed_mains_lst, appliance_list
+
+# Export for nilmtk_contrib
+__all__ = ['MSDC']
\ No newline at end of file
diff --git a/nilmtk_contrib/torch/msdc_without_crf.py b/nilmtk_contrib/torch/msdc_without_crf.py
new file mode 100644
index 0000000..e5f81a0
--- /dev/null
+++ b/nilmtk_contrib/torch/msdc_without_crf.py
@@ -0,0 +1,653 @@
+from collections import OrderedDict
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+from nilmtk.disaggregate import Disaggregator
+
+
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
+class SequenceLengthError(Exception):
+    """Raised when the configured sequence length is not odd."""
+
+
+class ApplianceNotFoundError(Exception):
+    """Error type for an appliance that cannot be found."""
+
+
+class MSDCNet(nn.Module):
+    """
+    Dual-branch 1-D CNN used by MSDC (variant without a CRF layer).
+
+    The architecture follows the S2S_state model from the official MSDC
+    repository. Both branches share the same layout (six ReLU-activated
+    convolutions followed by two fully connected layers) but own separate
+    weights:
+
+    - ``*_p`` layers: power consumption for each appliance state.
+    - ``*_s`` layers: appliance state scores.
+    """
+
+    # (in_channels, out_channels, kernel_size) of conv1..conv6 in each branch.
+    # Padding is kernel_size // 2, so the window length is preserved.
+    _CONV_SPECS = (
+        (1, 30, 13),
+        (30, 30, 11),
+        (30, 40, 7),
+        (40, 50, 5),
+        (50, 60, 5),
+        (60, 60, 5),
+    )
+    _BRANCHES = ('p', 's')  # power branch first, then state branch
+
+    def __init__(self, window_length, out_len, num_states):
+        """
+        Args:
+            window_length: number of samples in every input window.
+            out_len: number of output positions per window.
+            num_states: number of appliance states predicted per position.
+        """
+        super(MSDCNet, self).__init__()
+        self.window_length = window_length
+        self.out_len = out_len
+        self.num_states = num_states
+
+        # Layers are registered under the names conv{1..6}_{p,s}, fc1_{p,s}
+        # and fc2_{p,s}, power branch first, so parameter names and creation
+        # order are unchanged.
+        for tag in self._BRANCHES:
+            for idx, (c_in, c_out, kernel) in enumerate(self._CONV_SPECS, start=1):
+                conv = nn.Conv1d(c_in, c_out, kernel, padding=kernel // 2)
+                setattr(self, f'conv{idx}_{tag}', conv)
+            setattr(self, f'fc1_{tag}', nn.Linear(60 * window_length, 1024))
+            setattr(self, f'fc2_{tag}', nn.Linear(1024, out_len * num_states))
+
+    def _run_branch(self, signal, tag):
+        """Apply the conv stack and the two dense layers of branch ``tag``."""
+        hidden = signal
+        for idx in range(1, len(self._CONV_SPECS) + 1):
+            hidden = F.relu(getattr(self, f'conv{idx}_{tag}')(hidden))
+        hidden = hidden.flatten(-2, -1)
+        hidden = F.relu(getattr(self, f'fc1_{tag}')(hidden))
+        return getattr(self, f'fc2_{tag}')(hidden)
+
+    def forward(self, x):
+        """
+        Args:
+            x: Input tensor of shape (batch_size, window_length)
+
+        Returns:
+            power_preds: Power predictions for each state (batch_size, out_len * num_states)
+            state_preds: State classification scores (batch_size, out_len * num_states)
+        """
+        # Add channel dimension: (batch_size, 1, window_length)
+        signal = x.unsqueeze(1)
+        # Both branches read the same input but do not share weights.
+        power_preds = self._run_branch(signal, 'p')
+        state_preds = self._run_branch(signal, 's')
+        return power_preds, state_preds
+
+
+class MSDC(Disaggregator):
+    """
+    Multi-State Dual CNN for non-intrusive load monitoring without CRF layer.
+    
+    This implementation is based on the paper:
+    "MSDC: Exploiting Multi-State Power Consumption in Non-intrusive Load Monitoring based on A Dual-CNN Model"
+    https://arxiv.org/abs/2302.05565
+    
+    The model uses a dual-branch CNN architecture without the CRF layer for joint state 
+    classification and power prediction in energy disaggregation tasks. This version 
+    directly predicts states and power consumption without CRF-based transition modeling.
+    
+    Architecture Overview:
+    - Dual-branch CNN for feature extraction
+    - Branch 1: Power consumption prediction for each state
+    - Branch 2: Direct state classification (without CRF layer)
+    - Multi-state power consumption modeling
+    - Simplified architecture compared to full MSDC model
+    
+    Parameters:
+        params (dict): Configuration parameters including:
+            - sequence_length (int): Length of input sequences
+            - n_epochs (int): Number of training epochs
+            - batch_size (int): Training batch size
+            - appliance_params (dict): Appliance-specific normalization parameters
+    """
+    
+    # Complete dataset-specific configurations from official MSDC implementation
+    APPLIANCE_STATES = {
+        'kettle': {
+            'uk_dale': {
+                'states': [2000, 4500],
+                'state_averages': [1.15, 2280.79],
+                'num_states': 2,
+                'threshold': 2000
+            }
+            # No REDD config for kettle in original - will fallback to UK-DALE
+        },
+        'microwave': {
+            'uk_dale': {
+                'states': [300, 3000],
+                'state_averages': [1.4, 1551.3],
+                'num_states': 2,
+                'threshold': 300
+            },
+            'redd': {
+                'states': [300, 3000],
+                'state_averages': [4.2, 1557.501],
+                'num_states': 2,
+                'threshold': 300
+            }
+        },
+        'fridge': {
+            'uk_dale': {
+                'states': [20, 200, 2500],
+                'state_averages': [0.13, 87.26, 246.5],
+                'num_states': 3,
+                'threshold': 20
+            },
+            'redd': {
+                'states': [50, 300, 500],
+                'state_averages': [3.2, 143.3, 397.3],
+                'num_states': 3,
+                'threshold': 50
+            },
+            'redd_house1': {
+                'states': [50, 300, 500],
+                'state_averages': [6.49, 192.57, 443],
+                'num_states': 3,
+                'threshold': 50
+            },
+            'redd_house2': {
+                'states': [50, 300, 500],
+                'state_averages': [6.34, 162.87, 418.36],
+                'num_states': 3,
+                'threshold': 50
+            },
+            'redd_house3': {
+                'states': [50, 300, 500],
+                'state_averages': [0.54, 118.85, 409.75],
+                'num_states': 3,
+                'threshold': 50
+            }
+        },
+        'dishwasher': {
+            'uk_dale': {
+                'states': [50, 1000, 4500],
+                'state_averages': [0.89, 122.56, 2324.9],
+                'num_states': 3,
+                'threshold': 50
+            },
+            'redd': {
+                'states': [150, 300, 1000, 3000],
+                'state_averages': [0.57, 232.91, 733.89, 1198.31],
+                'num_states': 4,
+                'threshold': 150
+            },
+            'redd_house1': {
+                'states': [150, 300, 1000, 3000],
+                'state_averages': [0.21, 216.75, 438.51, 1105.08],
+                'num_states': 4,
+                'threshold': 150
+            },
+            'redd_house2': {
+                'states': [150, 1000, 3000],
+                'state_averages': [0.16, 250.26, 1197.93],
+                'num_states': 3,
+                'threshold': 150
+            },
+            'redd_house3': {
+                'states': [50, 400, 1000],
+                'state_averages': [0.97, 195.6, 743.42],
+                'num_states': 3,
+                'threshold': 50
+            }
+        },
+        'washing machine': {
+            'uk_dale': {
+                'states': [50, 800, 3500],
+                'state_averages': [0.13, 204.64, 1892.85],
+                'num_states': 3,
+                'threshold': 50
+            },
+            'uk_dale_house2': {
+                'states': [50, 200, 1000, 4000],
+                'state_averages': [2.83, 114.34, 330.25, 2100.14],
+                'num_states': 4,
+                'threshold': 50
+            },
+            'redd': {
+                'states': [500, 5000],
+                'state_averages': [0, 2627.3],
+                'num_states': 2,
+                'threshold': 500
+            }
+        }
+    }
+    
+    # Dataset-specific normalization parameters
+    DATASET_NORMALIZATION = {
+        'uk_dale': {
+            'mains_mean': 1800,
+            'mains_std': 600
+        },
+        'redd': {
+            'mains_mean': 352.32,  # From official MSDC REDD implementation
+            'mains_std': 608.42
+        }
+    }
+    
+    def __init__(self, params):
+        """
+        Configure the disaggregator from the ``params`` dictionary.
+
+        Optional keys: ``dataset``, ``house``, ``sequence_length``, ``out_len``,
+        ``num_states``, ``n_epochs``, ``batch_size``, ``learning_rate``,
+        ``patience``, ``mains_mean``, ``mains_std`` and ``appliance_params``.
+        Raises SequenceLengthError when ``sequence_length`` is even.
+        """
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
+        super().__init__()
+        self.MODEL_NAME = "MSDC"
+        self.file_prefix = f"{self.MODEL_NAME.lower()}-temp-weights"
+
+        # Dataset selection: anything but UK-DALE / REDD falls back to UK-DALE.
+        self.house = params.get('house', None)
+        self.dataset = params.get('dataset', 'uk_dale').lower()
+        if self.dataset not in ('uk_dale', 'redd'):
+            _log_print(f"Warning: Unknown dataset '{self.dataset}'. Defaulting to 'uk_dale'.")
+            self.dataset = 'uk_dale'
+
+        # Lookup key for the state tables; house-specific when a house is given.
+        self.dataset_key = (
+            self.dataset if self.house is None else f"{self.dataset}_house{self.house}")
+
+        self.sequence_length = params.get('sequence_length', 99)
+        if self.sequence_length % 2 == 0:
+            raise SequenceLengthError("Sequence length must be odd")
+
+        self.out_len = params.get('out_len', 64)  # output positions per window
+        self.num_states = params.get('num_states', 3)  # default when no appliance config exists
+        self.n_epochs = params.get('n_epochs', 50)
+        self.batch_size = params.get('batch_size', 256)
+        self.learning_rate = params.get('learning_rate', 0.001)
+        self.patience = params.get('patience', 5)
+
+        # Mains normalisation: explicit params win over the dataset defaults.
+        defaults = self.DATASET_NORMALIZATION.get(self.dataset, self.DATASET_NORMALIZATION['uk_dale'])
+        self.mains_mean = params.get('mains_mean', defaults['mains_mean'])
+        self.mains_std = params.get('mains_std', defaults['mains_std'])
+        self.appliance_params = params.get('appliance_params', {})
+
+        # One network per appliance, kept in insertion order.
+        self.models = OrderedDict()
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        _log_print(f"MSDC initialized for dataset: {self.dataset.upper()}")
+        if self.house:
+            _log_print(f"House: {self.house}")
+        _log_print(f"Configuration key: {self.dataset_key}")
+        _log_print(f"Mains normalization - mean: {self.mains_mean}, std: {self.mains_std}")
+    
+    def _get_appliance_config(self, appliance_name):
+        """
+        Return the state configuration to use for ``appliance_name``.
+
+        Lookup order: exact ``dataset_key``, then the plain dataset, then the
+        first configuration listed for the appliance (logging a warning).
+        ``None`` is returned when the appliance has no configuration at all.
+        """
+        if appliance_name not in self.APPLIANCE_STATES:
+            return None
+        per_dataset = self.APPLIANCE_STATES[appliance_name]
+
+        for key in (self.dataset_key, self.dataset):
+            if key in per_dataset:
+                return per_dataset[key]
+
+        if not per_dataset:
+            return None
+        fallback_key = next(iter(per_dataset))
+        _log_print(f"Warning: No {self.dataset_key} config for {appliance_name}, using {fallback_key}")
+        return per_dataset[fallback_key]
+    
+    def return_network(self, appliance_name):
+        """Build a fresh MSDCNet for ``appliance_name`` on ``self.device``."""
+        config = self._get_appliance_config(appliance_name)
+        if not config:
+            # No state table for this appliance: use the configured default.
+            num_states = self.num_states
+            _log_print(f"Warning: No config found for {appliance_name}, using default {num_states} states")
+        else:
+            num_states = config['num_states']
+            _log_print(f"Creating network for {appliance_name} with {num_states} states ({self.dataset_key})")
+        return MSDCNet(self.sequence_length, self.out_len, num_states).to(self.device)
+    
+    def set_appliance_params(self, train_appliances):
+        """Record each appliance's training mean/std in ``self.appliance_params``."""
+        for name, frames in train_appliances:
+            # Statistics are always recomputed from the concatenated readings.
+            readings = pd.concat(frames, axis=0).values.flatten()
+            mean = readings.mean()
+            std = readings.std()
+            # A spread below 1 is replaced by 100 to avoid dividing by ~0 later.
+            std = 100 if std < 1 else std
+            _log_print(f"Computed normalization for {name}: mean={mean:.2f}, std={std:.2f}")
+
+            self.appliance_params[name] = {'mean': mean, 'std': std}
+    
+    def _create_state_labels(self, power_sequence, appliance_name):
+        """
+        Map a power sequence to integer state labels.
+
+        Thresholds come from the appliance configuration when one exists;
+        otherwise they are derived from the appliance's stored mean power (or
+        the mean of ``power_sequence`` if no statistics are stored). A sample's
+        label is one more than the index of the last threshold it reaches,
+        clipped to ``num_states - 1``; samples below every threshold get 0.
+
+        Returns:
+            1-D ``int64`` array with one label per element of ``power_sequence``.
+        """
+        power = power_sequence.flatten()
+        config = self._get_appliance_config(appliance_name)
+        if not config:
+            # Fallback to dynamic thresholds scaled by the mean power.
+            num_states = self.num_states
+            if appliance_name in self.appliance_params:
+                mean_power = self.appliance_params[appliance_name]['mean']
+            else:
+                mean_power = power.mean()
+            if num_states == 2:
+                thresholds = [0.1 * mean_power]
+            elif num_states == 3:
+                thresholds = [0.1 * mean_power, 0.7 * mean_power]
+            else:
+                thresholds = np.linspace(0, mean_power * 1.2, num_states)[1:]
+        else:
+            thresholds = config['states']
+            num_states = config['num_states']
+
+        # Later thresholds overwrite earlier ones, so the last one reached wins.
+        states = np.zeros_like(power, dtype=np.int64)
+        for label, threshold in enumerate(thresholds, start=1):
+            states[power >= threshold] = label
+
+        # Ensure states are within valid range
+        return np.clip(states, 0, num_states - 1).astype(np.int64)
+    
+    def _compute_msdc_loss(self, power_preds, state_preds, y_power, y_states, appliance_name):
+        """
+        Combined training loss of the CRF-free MSDC model.
+
+        total = MSE(expected power, target power) + cross-entropy(state logits, labels)
+
+        where the expected power is the softmax-weighted sum of the per-state
+        power predictions.
+
+        Args:
+            power_preds, state_preds: network outputs, ``(batch, out_len * num_states)``.
+            y_power: target power; ``partial_fit`` passes ``(batch, out_len)``.
+            y_states: integer state labels with ``batch * out_len`` elements.
+            appliance_name: selects the number of states via the appliance config.
+
+        Returns:
+            ``(total_loss, power_loss, state_loss)``.
+        """
+        # Number of states: appliance-specific config if present, else default.
+        config = self._get_appliance_config(appliance_name)
+        num_states = config['num_states'] if config else self.num_states
+
+        # Unflatten the network outputs to (batch, out_len, num_states).
+        shape = (y_power.shape[0], self.out_len, num_states)
+        power_by_state = power_preds.view(*shape)
+        state_logits = state_preds.view(*shape)
+
+        # 1. Power loss on the probability-weighted mix of per-state powers.
+        expected_power = (F.softmax(state_logits, dim=-1) * power_by_state).sum(dim=-1)
+        power_loss = F.mse_loss(expected_power, y_power)
+
+        # 2. State loss; cross_entropy takes (N, num_states) logits and (N,) labels.
+        state_loss = F.cross_entropy(
+            state_logits.view(-1, num_states), y_states.view(-1))
+
+        # Combined loss (following original implementation)
+        total_loss = power_loss + state_loss
+
+        return total_loss, power_loss, state_loss
+
+    def partial_fit(self, train_main, train_appliances,
+                    do_preprocessing=True, current_epoch=0, **_):
+        """
+        Train one MSDC model per appliance on a chunk of data.
+
+        For every appliance a network is created on first use (and reused on
+        later chunks), per-sample state labels are derived from the appliance
+        power, and the network is optimised with Adam for ``self.n_epochs``
+        epochs on the combined power/state loss.
+
+        Args:
+            train_main: list of mains DataFrames.
+            train_appliances: list of ``(appliance_name, [DataFrame, ...])``.
+            do_preprocessing: run ``call_preprocessing`` on the inputs first.
+            current_epoch: accepted for API compatibility; not used here.
+        """
+        _log_print("Started Partial Fit")
+
+        # Compute appliance parameters if not provided
+        if len(self.appliance_params) == 0:
+            self.set_appliance_params(train_appliances)
+
+        _log_print("Preprocessing called")
+        # Preprocess data using NILMTK-compatible method
+        if do_preprocessing:
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
+
+        _log_print("Preprocessing done")
+
+        # One mains window per row; shared by every appliance model.
+        mains_arr = pd.concat(train_main, axis=0).values
+        mains_arr = mains_arr.reshape(-1, self.sequence_length)
+        mains_tensor = torch.FloatTensor(mains_arr).to(self.device)
+
+        # Train a separate model for each appliance
+        for appliance_name, app_dfs in train_appliances:
+            # NOTE(review): assumes the appliance windows hold out_len samples each.
+            app_data = pd.concat(app_dfs, axis=0).values.reshape(-1, self.out_len)
+            _log_print(f"\nTraining {appliance_name} for {self.dataset_key}...")
+
+            # Check if the appliance was already trained
+            if appliance_name not in self.models:
+                self.models[appliance_name] = self.return_network(appliance_name)
+
+            model = self.models[appliance_name]
+            optimizer = optim.Adam(model.parameters(), lr=self.learning_rate)
+            # Convert to tensors
+            app_tensor = torch.FloatTensor(app_data).to(self.device)
+
+            # The state thresholds (APPLIANCE_STATES / mean-based fallback) are
+            # in watts, whereas app_data is normalised. Undo the normalisation
+            # before labelling; otherwise almost every sample stays below the
+            # first threshold and is labelled state 0.
+            norm = self.appliance_params.get(appliance_name, {'mean': 0.0, 'std': 1.0})
+            raw_power = app_data * norm['std'] + norm['mean']
+            state_labels = np.array([
+                self._create_state_labels(row, appliance_name) for row in raw_power])
+            state_tensor = torch.LongTensor(state_labels).to(self.device)
+
+            # Create dataset and dataloader
+            dataset = TensorDataset(mains_tensor, app_tensor, state_tensor)
+            dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
+
+            # Training loop
+            model.train()
+            _log_print("Training loop started")
+            for epoch in range(self.n_epochs):
+                _log_print(f"Epoch {epoch + 1}/{self.n_epochs} for {appliance_name}")
+                total_loss = 0
+                batch_count = 0
+                for batch_mains, batch_app, batch_states in dataloader:
+                    optimizer.zero_grad()
+
+                    # Forward pass through MSDC network
+                    power_preds, state_preds = model(batch_mains)
+
+                    # Compute MSDC loss (without CRF)
+                    loss, _power_loss, _state_loss = self._compute_msdc_loss(
+                        power_preds, state_preds, batch_app, batch_states, appliance_name
+                    )
+
+                    # Backward pass
+                    loss.backward()
+                    optimizer.step()
+
+                    total_loss += loss.item()
+                    batch_count += 1
+
+                if epoch % 10 == 0:
+                    # max() guards the average when the loader yielded no batch.
+                    avg_loss = total_loss / max(batch_count, 1)
+                    _log_print(f"Epoch {epoch}/{self.n_epochs}, Avg Loss: {avg_loss:.4f}")
+    
+    def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
+        """
+        Disaggregate power consumption using the trained MSDC model.
+
+        Args:
+            test_main_list: list of mains DataFrames.
+            model: optional mapping ``appliance -> network`` replacing ``self.models``.
+            do_preprocessing: run ``call_preprocessing`` on the mains first.
+
+        Returns:
+            One float32 DataFrame per mains chunk, one column per appliance,
+            holding the de-normalised, non-negative power taken at the centre
+            output position of every window.
+        """
+        if model is not None:
+            self.models = model
+
+        # Preprocess the test mains
+        if do_preprocessing:
+            test_main_list = self.call_preprocessing(test_main_list, submeters_lst=None, method='test')
+
+        center_idx = self.out_len // 2
+        test_predictions = []
+        for test_main in test_main_list:
+            windows = test_main.values.reshape((-1, self.sequence_length))
+            test_main_tensor = torch.FloatTensor(windows).to(self.device)
+            disggregation_dict = {}
+
+            # ``net`` (not ``model``) so the loop does not shadow the argument.
+            for appliance, net in self.models.items():
+                net.eval()
+                config = self._get_appliance_config(appliance)
+                num_states = config['num_states'] if config else self.num_states
+
+                # Run inference in mini-batches: pushing a whole chunk through
+                # the network at once can exhaust memory on long recordings.
+                center_power = []
+                with torch.no_grad():
+                    for batch in torch.split(test_main_tensor, self.batch_size):
+                        power_preds, state_preds = net(batch)
+                        power_preds = power_preds.view(batch.shape[0], self.out_len, num_states)
+                        state_preds = state_preds.view(batch.shape[0], self.out_len, num_states)
+
+                        # Expected power at the centre position: softmax-weighted
+                        # sum of the per-state power predictions.
+                        probs = F.softmax(state_preds[:, center_idx], dim=-1)
+                        center_power.append((probs * power_preds[:, center_idx]).sum(dim=-1))
+                pred = torch.cat(center_power).cpu().numpy()
+
+                # Denormalize predictions
+                params = self.appliance_params[appliance]
+                pred = pred * params['std'] + params['mean']
+                disggregation_dict[appliance] = np.where(pred > 0, pred, 0)  # non-negative power
+
+            test_predictions.append(pd.DataFrame(disggregation_dict, dtype='float32'))
+
+        return test_predictions
+    
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """
+        Window and normalise raw readings (preprocessing hook of the NILMTK API).
+
+        Every series is flattened, zero-padded by ``sequence_length // 2``
+        samples on both sides and cut into stride-1 windows of
+        ``sequence_length`` samples. Mains windows are standardised with
+        ``self.mains_mean`` / ``self.mains_std``.
+
+        For ``method == 'train'`` each appliance additionally yields one target
+        sequence of ``self.out_len`` samples per mains window, taken around the
+        centre of that window and standardised with the appliance's own
+        ``mean`` / ``std`` from ``self.appliance_params``. Parts of a target
+        that fall outside the padded series are filled with zeros.
+
+        Args:
+            mains_lst: list of mains DataFrames.
+            submeters_lst: list of ``(appliance_name, [DataFrame, ...])`` tuples;
+                only read when ``method == 'train'``.
+            method: ``'train'`` returns mains and targets; any other value is
+                treated as test mode and returns mains only.
+
+        Returns:
+            ``(processed_mains_lst, appliance_list)`` in train mode, otherwise
+            ``processed_mains_lst``.
+
+        Raises:
+            ApplianceNotFoundError: an appliance in ``submeters_lst`` has no
+                entry in ``self.appliance_params``.
+        """
+        n = self.sequence_length
+        units_to_pad = n // 2
+
+        def _pad(readings):
+            # Zero-pad both ends so that every original sample gets a window.
+            return np.pad(readings, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+
+        def _mains_windows(mains):
+            # Shared by the train and test paths so both window identically.
+            padded = _pad(mains.values.flatten())
+            windows = np.array([padded[i:i + n] for i in range(len(padded) - n + 1)])
+            windows = (windows - self.mains_mean) / self.mains_std
+            return pd.DataFrame(windows.reshape((-1, n)))
+
+        processed_mains_lst = [_mains_windows(mains) for mains in mains_lst]
+        if method != 'train':
+            return processed_mains_lst
+
+        # Index of the window centre, and how far the target reaches to its left.
+        offset = int(0.5 * (n - 1.0))
+        half_out = self.out_len // 2
+
+        appliance_list = []
+        for app_name, app_df_lst in submeters_lst:
+            if app_name not in self.appliance_params:
+                raise ApplianceNotFoundError()
+            app_mean = self.appliance_params[app_name]['mean']
+            app_std = self.appliance_params[app_name]['std']
+
+            processed_app_dfs = []
+            for app_df in app_df_lst:
+                padded = _pad(app_df.values.flatten())
+                total = len(padded)
+
+                app_sequences = []
+                for i in range(total - n + 1):
+                    start_idx = i + offset - half_out
+                    end_idx = start_idx + self.out_len
+                    if start_idx >= 0 and end_idx <= total:
+                        seq = padded[start_idx:end_idx]
+                    else:
+                        # Copy only the overlap with the padded series; this
+                        # also covers a target overhanging both ends at once,
+                        # which previously failed with a shape mismatch.
+                        seq = np.zeros(self.out_len)
+                        lo = max(start_idx, 0)
+                        hi = min(end_idx, total)
+                        if hi > lo:
+                            seq[lo - start_idx:hi - start_idx] = padded[lo:hi]
+                    app_sequences.append(seq)
+
+                app_sequences = np.array(app_sequences)
+                app_sequences = (app_sequences - app_mean) / app_std
+                processed_app_dfs.append(pd.DataFrame(app_sequences))
+
+            appliance_list.append((app_name, processed_app_dfs))
+
+        return processed_mains_lst, appliance_list
+
+# Limit `from <this module> import *` to the single name 'MSDC'.
+__all__ = ['MSDC']
diff --git a/nilmtk_contrib/torch/nilmformer.py b/nilmtk_contrib/torch/nilmformer.py
new file mode 100644
index 0000000..628f391
--- /dev/null
+++ b/nilmtk_contrib/torch/nilmformer.py
@@ -0,0 +1,1039 @@
+"""
+NILMFormer: PyTorch Implementation for NILMTK-Contrib
+
+This is a NILMFormer-inspired implementation based on the paper:
+"NILMFormer: Non-Intrusive Load Monitoring that Accounts for Non-Stationarity"
+by Petralia et al. (ACM SIGKDD 2025)
+
+Official GitHub: https://github.com/adrienpetralia/NILMFormer
+Paper: https://arxiv.org/html/2506.05880v1
+
+Architecture components to audit against the official implementation:
+1. Instance Normalization: Stationarizes each input window by subtracting its mean and dividing by its standard deviation
+2. DilatedBlock: Robust convolutional feature extractor with residual connections
+3. TokenStats: Linear projection of mean/std statistics into higher dimensional space
+4. Exogenous Features: Temporal encoding using create_exogene (sinusoidal functions for
+   month, day-of-week, hour, minute)
+5. Transformer Encoder: Diagonal masked self-attention with pre-norm architecture
+6. Output Head: 1D convolution for sequence-to-sequence prediction
+7. Denormalization: Reverse instance normalization using projected statistics
+
+Key Features:
+- create_exogene for capturing temporal patterns (from original NILMFormer repo)
+- Diagonal masking (not causal) in self-attention
+- GELU activations throughout
+- Pre-norm transformer blocks
+- Instance normalization for non-stationarity handling
+- Sequence-to-sequence prediction with middle-point extraction
+- Parameter defaults intended to track the official config (d_model=96, n_heads=8, etc.)
+
+This implementation adapts NILMFormer concepts to the NILMTK-Contrib
+Disaggregator interface. Source parity must be verified before making
+reproduction claims.
+"""
+
+from typing import List, Optional
+from collections import OrderedDict
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader
+from tqdm import tqdm
+from nilmtk.disaggregate import Disaggregator
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+
+# Module-level logger obtained from the shared helper, keyed by this module's name.
+logger = module_logger(__name__)
+# Callable used below for status messages; presumably a print-compatible wrapper
+# that routes through `logger` -- confirm in nilmtk_contrib.utils.model.
+_log_print = legacy_print(logger)
+
+
+class SequenceLengthError(Exception):
+    """Raised when a disaggregator is configured with an even ``sequence_length``."""
+
+
+class ApplianceNotFoundError(Exception):
+    """Error type for an appliance that is unknown to the model (raise sites are outside this excerpt)."""
+
+
+class NILMDataset(Dataset):
+    """
+    Index-aligned (input, target) pairs exposed through the torch Dataset protocol.
+    """
+    def __init__(self, inputs, targets):
+        """
+        Keep references to both collections; nothing is copied or validated.
+
+        Args:
+            inputs: indexable, sized collection of model inputs (per the original
+                note: a tensor of mains power plus exogenous channels).
+            targets: indexable collection of training targets, aligned with
+                ``inputs`` on the first axis.
+        """
+        self.targets = targets
+        self.inputs = inputs
+
+    def __getitem__(self, idx):
+        sample, label = self.inputs[idx], self.targets[idx]
+        return sample, label
+
+    def __len__(self):
+        # The dataset length is defined by the inputs alone.
+        return len(self.inputs)
+
+
+class ResUnit(nn.Module):
+    """
+    Conv1d -> GELU -> BatchNorm1d branch summed with a shortcut of its input.
+
+    The shortcut is a learned 1x1 convolution when ``c_in > 1`` and
+    ``c_in != c_out``; otherwise the raw input is added directly (a
+    single-channel input is then broadcast across the ``c_out`` channels).
+    """
+    def __init__(self, c_in: int, c_out: int, k: int = 8, dilation: int = 1,
+                 stride: int = 1, bias: bool = True):
+        super().__init__()
+
+        # "same" padding keeps the sequence length unchanged.
+        branch_conv = nn.Conv1d(
+            in_channels=c_in,
+            out_channels=c_out,
+            kernel_size=k,
+            stride=stride,
+            padding="same",
+            dilation=dilation,
+            bias=bias,
+        )
+        self.layers = nn.Sequential(branch_conv, nn.GELU(), nn.BatchNorm1d(c_out))
+
+        self.match_residual = c_in > 1 and c_in != c_out
+        if self.match_residual:
+            # Projects the input to c_out channels so the two paths can be summed.
+            self.conv = nn.Conv1d(in_channels=c_in, out_channels=c_out, kernel_size=1)
+
+    def forward(self, x) -> torch.Tensor:
+        shortcut = self.conv(x) if self.match_residual else x
+        return torch.add(shortcut, self.layers(x))
+
+
+class DilatedBlock(nn.Module):
+    """
+    Stack of ResUnit layers, one per entry of the dilation list.
+
+    The first unit maps ``c_in`` -> ``c_out`` channels; every following unit
+    maps ``c_out`` -> ``c_out``.
+    """
+    def __init__(self, c_in: int = 1, c_out: int = 72, kernel_size: int = 8,
+                 dilation_list: Optional[List[int]] = None, bias: bool = True):
+        super().__init__()
+
+        rates = [1, 2, 4, 8] if dilation_list is None else dilation_list
+
+        units = []
+        channels = c_in
+        for rate in rates:
+            units.append(
+                ResUnit(channels, c_out, k=kernel_size, dilation=rate, bias=bias)
+            )
+            # Only the first unit sees the raw input channel count.
+            channels = c_out
+        self.network = torch.nn.Sequential(*units)
+
+    def forward(self, x) -> torch.Tensor:
+        return self.network(x)
+
+
+def create_exogene(start_date, sequence_length, freq="1min",
+                   list_exo_variables=None, cosinbase=True, new_range=(-1, 1)):
+    """
+    Build calendar-based exogenous channels for one time sequence.
+
+    Args:
+        start_date: first timestamp of the sequence (a string is parsed with
+            ``pd.to_datetime``).
+        sequence_length: number of timestamps to generate.
+        freq: pandas frequency string giving the spacing between timestamps.
+        list_exo_variables: names out of 'month', 'dom', 'dow', 'hour',
+            'minute'; defaults to ['month', 'dow', 'hour', 'minute'].
+        cosinbase: True -> each variable gives a sin and a cos channel;
+            False -> one channel rescaled to ``new_range``.
+        new_range: target interval used when ``cosinbase`` is False.
+
+    Returns:
+        float32 array of shape (1, n_channels, sequence_length); channels
+        follow the order of ``list_exo_variables``.
+
+    Raises:
+        ValueError: a requested variable name is not supported.
+    """
+    if list_exo_variables is None:
+        list_exo_variables = ['month', 'dow', 'hour', 'minute']  # Default temporal features
+
+    channels_per_var = 2 if cosinbase else 1
+    n_var = channels_per_var * len(list_exo_variables)
+
+    if isinstance(start_date, str):
+        start_date = pd.to_datetime(start_date)
+    stamps = pd.date_range(start=start_date, periods=sequence_length, freq=freq)
+
+    # (name, DatetimeIndex attribute, sinusoid period, smallest value, largest value)
+    calendar = (
+        ("month", "month", 12.0, 1, 12),
+        ("dom", "day", 31.0, 1, 31),
+        ("dow", "dayofweek", 7.0, 0, 6),
+        ("hour", "hour", 24.0, 0, 23),
+        ("minute", "minute", 60.0, 0, 59),
+    )
+
+    np_extra = np.zeros((1, n_var, sequence_length)).astype(np.float32)
+
+    row = 0
+    for exo_var in list_exo_variables:
+        spec = next((entry for entry in calendar if exo_var == entry[0]), None)
+        if spec is None:
+            raise ValueError(
+                f"Embedding unknown for these Data. Only 'month', 'dow', 'dom', 'hour', 'minute' supported, received {exo_var}"
+            )
+        _, attribute, period, lowest, highest = spec
+        values = getattr(stamps, attribute).values
+
+        if cosinbase:
+            np_extra[0, row, :] = np.sin(2 * np.pi * values / period)
+            np_extra[0, row + 1, :] = np.cos(2 * np.pi * values / period)
+        else:
+            np_extra[0, row, :] = normalize_exogene(
+                values, xmin=lowest, xmax=highest, newRange=new_range
+            )
+        row += channels_per_var
+
+    return np_extra
+
+
+def normalize_exogene(x, xmin, xmax, newRange):
+    """
+    Min-max rescale ``x`` into ``newRange``.
+
+    ``xmin`` / ``xmax`` set the source interval; passing None for either
+    falls back to the observed minimum / maximum of ``x``. The data are first
+    mapped onto [0, 1] and then, unless ``newRange`` is exactly ``(0, 1)``,
+    stretched and shifted to ``[newRange[0], newRange[1]]``.
+    """
+    lower = np.min(x) if xmin is None else xmin
+    upper = np.max(x) if xmax is None else xmax
+
+    unit = (x - lower) / (upper - lower)
+    if newRange != (0, 1):
+        return unit * (newRange[1] - newRange[0]) + newRange[0]
+    return unit
+
+
+class DiagonalMaskFromSeqlen:
+    """
+    Boolean mask of shape (B, 1, L, L) that is True exactly on the main diagonal.
+    """
+    def __init__(self, B, L, device="cpu"):
+        with torch.no_grad():
+            identity = torch.eye(L, dtype=torch.bool, device=device)
+            # One (1, L, L) copy of the identity per batch element.
+            self._mask = identity.repeat(B, 1, 1, 1)
+
+    @property
+    def mask(self) -> torch.Tensor:
+        """The stored mask tensor."""
+        return self._mask
+
+
+class DiagonallyMaskedSelfAttention(nn.Module):
+    """
+    Multi-head self-attention in which a position never attends to itself.
+
+    Scores on the main diagonal are set to -inf before the softmax, so each
+    token's output is built only from the other tokens of the sequence.
+
+    Args:
+        dim: feature size of the input and output tokens.
+        n_heads: number of attention heads.
+        head_dim: feature size of each head.
+        dropout: dropout probability, applied to the attention weights and to
+            the projected output.
+    """
+    def __init__(self, dim: int, n_heads: int, head_dim: int, dropout: float):
+        super().__init__()
+
+        self.n_heads: int = n_heads
+        self.head_dim: int = head_dim
+        self.dropout: float = dropout
+        self.scale = head_dim**-0.5
+
+        self.attn_dropout = nn.Dropout(dropout)
+        self.out_dropout = nn.Dropout(dropout)
+
+        self.wq = nn.Linear(dim, n_heads * head_dim, bias=False)
+        self.wk = nn.Linear(dim, n_heads * head_dim, bias=False)
+        self.wv = nn.Linear(dim, n_heads * head_dim, bias=False)
+        self.wo = nn.Linear(n_heads * head_dim, dim, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: tensor of shape (batch, seqlen, dim).
+
+        Returns:
+            Tensor of shape (batch, seqlen, dim).
+        """
+        batch, seqlen, _ = x.shape
+
+        xq = self.wq(x).view(batch, seqlen, self.n_heads, self.head_dim)
+        xk = self.wk(x).view(batch, seqlen, self.n_heads, self.head_dim)
+        xv = self.wv(x).view(batch, seqlen, self.n_heads, self.head_dim)
+
+        scores = torch.einsum("blhe,bshe->bhls", xq, xk)  # (batch, heads, L, L)
+
+        if seqlen > 1:
+            # A single (L, L) mask broadcasts over batch and heads, so no
+            # per-batch copy is materialised.
+            diagonal = torch.eye(seqlen, dtype=torch.bool, device=scores.device)
+            weights = torch.softmax(
+                self.scale * scores.masked_fill_(diagonal, float("-inf")), dim=-1
+            )
+        else:
+            # With one token the only score is the masked diagonal; softmax over
+            # a row of -inf would return NaN, so use zero attention instead.
+            weights = torch.zeros_like(scores)
+
+        attn = self.attn_dropout(weights)
+        output = torch.einsum("bhls,bshd->blhd", attn, xv)
+
+        return self.out_dropout(self.wo(output.reshape(batch, seqlen, -1)))
+
+
+class PositionWiseFeedForward(nn.Module):
+    """
+    Two-layer MLP applied to the last dimension: Linear -> GELU -> Dropout -> Linear.
+    """
+    def __init__(self, dim: int, hidden_dim: int, dp_rate: float = 0.0,
+                 bias1: bool = True, bias2: bool = True):
+        super().__init__()
+        # Expand to hidden_dim, then project back to dim.
+        self.layer1 = nn.Linear(dim, hidden_dim, bias=bias1)
+        self.layer2 = nn.Linear(hidden_dim, dim, bias=bias2)
+        self.dropout = nn.Dropout(dp_rate)
+        self.activation = F.gelu
+
+    def forward(self, x) -> torch.Tensor:
+        hidden = self.activation(self.layer1(x))
+        hidden = self.dropout(hidden)
+        return self.layer2(hidden)
+
+
+class EncoderLayer(nn.Module):
+    """
+    One transformer encoder layer: LayerNorm + diagonally masked self-attention,
+    then LayerNorm + position-wise feed-forward.
+
+    Note that in each sub-block the residual is added to the *normalised*
+    tensor, i.e. ``y = norm(x); y = y + f(y)``, not to the un-normalised input.
+    """
+    def __init__(self, d_model: int, n_heads: int, dp_rate: float = 0.2,
+                 pffn_ratio: int = 4, norm_eps: float = 1e-5):
+        super().__init__()
+
+        assert d_model % n_heads == 0, (
+            f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
+        )
+        per_head = d_model // n_heads
+
+        self.attention_layer = DiagonallyMaskedSelfAttention(
+            dim=d_model, n_heads=n_heads, head_dim=per_head, dropout=dp_rate
+        )
+
+        self.norm1 = nn.LayerNorm(d_model, eps=norm_eps)
+        self.norm2 = nn.LayerNorm(d_model, eps=norm_eps)
+        self.dropout = nn.Dropout(dp_rate)
+
+        # Hidden width of the feed-forward block is pffn_ratio times d_model.
+        self.pffn = PositionWiseFeedForward(
+            dim=d_model, hidden_dim=d_model * pffn_ratio, dp_rate=dp_rate
+        )
+
+    def forward(self, x) -> torch.Tensor:
+        # Attention sub-block
+        normed = self.norm1(x)
+        attended = torch.add(normed, self.attention_layer(normed))
+
+        # Feed-forward sub-block (extra dropout on its output)
+        normed = self.norm2(attended)
+        return torch.add(normed, self.dropout(self.pffn(normed)))
+
+
+class NILMFormerNetwork(nn.Module):
+    """
+    NILMFormer-style network: instance-normalised load curve -> dilated conv
+    embedding, concatenated with projected exogenous channels and a statistics
+    token -> transformer encoder -> convolutional head -> re-scaling with
+    learned output statistics.
+    """
+    def __init__(self, c_in=1, c_embedding=8, c_out=1, kernel_size=3,
+                 kernel_size_head=3, dilations=None, conv_bias=True,
+                 n_encoder_layers=3, d_model=96, dp_rate=0.2, pffn_ratio=4,
+                 n_heads=8, norm_eps=1e-5):
+        super().__init__()
+
+        dilations = [1, 2, 4, 8] if dilations is None else dilations
+
+        # d_model is split 3/4 (conv features) + 1/4 (exogenous features).
+        assert d_model % 4 == 0, "d_model must be divisible by 4."
+
+        self.d_model = d_model
+        self.c_out = c_out
+
+        # ---- Embedding ----
+        conv_width = 3 * d_model // 4  # e.g. 72 when d_model is 96
+        exo_width = d_model // 4
+
+        self.EmbedBlock = DilatedBlock(
+            c_in=c_in,
+            c_out=conv_width,
+            kernel_size=kernel_size,
+            dilation_list=dilations,
+            bias=conv_bias,
+        )
+
+        # Pointwise projection of the exogenous channels.
+        self.ProjEmbedding = nn.Conv1d(
+            in_channels=c_embedding, out_channels=exo_width, kernel_size=1
+        )
+
+        # (mean, std) -> token, and token -> (output mean, output std).
+        self.ProjStats1 = nn.Linear(2, d_model)
+        self.ProjStats2 = nn.Linear(d_model, 2)
+
+        # ---- Encoder ----
+        encoder_stack = [
+            EncoderLayer(d_model, n_heads, dp_rate, pffn_ratio, norm_eps)
+            for _ in range(n_encoder_layers)
+        ]
+        encoder_stack.append(nn.LayerNorm(d_model))
+        self.EncoderBlock = nn.Sequential(*encoder_stack)
+
+        # ---- Head ----
+        self.DownstreamTaskHead = nn.Conv1d(
+            in_channels=d_model,
+            out_channels=c_out,
+            kernel_size=kernel_size_head,
+            padding=kernel_size_head // 2,
+            padding_mode="replicate",
+        )
+
+        self.initialize_weights()
+
+    def initialize_weights(self):
+        """Apply ``_init_weights`` to this module and all of its sub-modules."""
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        """Xavier-uniform weights / zero bias for Linear; unit weight / zero bias for LayerNorm."""
+        if isinstance(m, nn.Linear):
+            torch.nn.init.xavier_uniform_(m.weight)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def forward(self, x) -> torch.Tensor:
+        """
+        Args:
+            x: tensor of shape (B, 1 + e, L); channel 0 is the load curve and
+               the remaining e channels are the exogenous features.
+
+        Returns:
+            Tensor produced by the conv head and rescaled by the projected
+            statistics; (B, c_out, L) for an odd ``kernel_size_head``.
+        """
+        load = x[:, :1, :]   # (B, 1, L)
+        exo = x[:, 1:, :]    # (B, e, L)
+
+        # Per-window statistics; detached so no gradient flows through them.
+        inst_mean = torch.mean(load, dim=-1, keepdim=True).detach()
+        inst_var = torch.var(load, dim=-1, keepdim=True, unbiased=False)
+        inst_std = torch.sqrt(inst_var + 1e-6).detach()
+        load = (load - inst_mean) / inst_std
+
+        # Conv features and projected exogenous features, joined on the channel
+        # axis and moved to (B, L, d_model) for the encoder.
+        conv_feats = self.EmbedBlock(load)
+        exo_feats = self.ProjEmbedding(exo)
+        tokens = torch.cat([conv_feats, exo_feats], dim=1).permute(0, 2, 1)
+
+        # One extra token carrying the (mean, std) pair, appended at the end.
+        stats = torch.cat([inst_mean, inst_std], dim=1).permute(0, 2, 1)
+        stats_token = self.ProjStats1(stats)              # (B, 1, d_model)
+        tokens = torch.cat([tokens, stats_token], dim=1)  # (B, L + 1, d_model)
+
+        # Encode, then drop the statistics token again.
+        encoded = self.EncoderBlock(tokens)[:, :-1, :]
+
+        out = self.DownstreamTaskHead(encoded.permute(0, 2, 1))
+
+        # Output scale/shift come from the statistics token as it was *before*
+        # the encoder, not from the encoder's output at that position.
+        stats_out = self.ProjStats2(stats_token)          # (B, 1, 2)
+        out_mean = stats_out[:, :, 0].unsqueeze(-1)
+        out_std = stats_out[:, :, 1].unsqueeze(-1)
+
+        return out * out_std + out_mean
+
+
+class NILMFormer(Disaggregator):
+    """
+    NILMFormer: Transformer-based model for non-intrusive load monitoring.
+    
+    This implementation is based on the paper:
+    "NILMFormer: Non-Intrusive Load Monitoring that Accounts for Non-Stationarity"
+    https://arxiv.org/abs/2506.05880
+    
+    The model uses a transformer architecture specifically designed for energy disaggregation 
+    tasks that addresses non-stationarity in power consumption data through instance 
+    normalization and temporal feature encoding.
+    
+    Architecture Overview:
+    - Instance normalization for handling non-stationarity
+    - Dilated convolutional feature extractor with residual connections
+    - Exogenous temporal features (month, day-of-week, hour, minute)
+    - Transformer encoder with diagonal masked self-attention
+    - Sequence-to-sequence prediction with denormalization
+    
+    Parameters:
+        params (dict): Configuration parameters including:
+            - sequence_length (int): Input sequence length (default: 99)
+            - c_in (int): Input channels (default: 1)
+            - c_embedding (int): Exogenous channels (default: 8)
+            - d_model (int): Model dimension (default: 96)
+            - n_heads (int): Number of attention heads (default: 8)
+            - n_encoder_layers (int): Number of transformer encoder layers (default: 3)
+            - n_epochs (int): Number of training epochs (default: 100)
+            - batch_size (int): Training batch size (default: 1024)
+    """
+
+    def __init__(self, params):
+        """
+        Configure a NILMFormer disaggregator from a parameter dictionary.
+
+        Parameters:
+        -----------
+        params : dict
+            Optional keys (default in parentheses):
+            - sequence_length: Input sequence length, must be odd (99)
+            - c_in: Input channels (1)
+            - c_embedding: Exogenous channels (8)
+            - c_out: Output channels (1)
+            - d_model: Model dimension (96)
+            - n_heads: Number of attention heads (8)
+            - n_encoder_layers: Number of encoder layers (3)
+            - dp_rate: Dropout rate (0.2)
+            - pffn_ratio: Feed-forward expansion ratio (4)
+            - kernel_size: Embedding conv kernel size (3)
+            - kernel_size_head: Output head conv kernel size (3)
+            - dilations: Dilation factors ([1, 2, 4, 8])
+            - conv_bias: Bias in the embedding convolutions (True)
+            - norm_eps: LayerNorm epsilon (1e-5)
+            - chunk_wise_training: (False)
+            - n_epochs: Training epochs (100)
+            - batch_size: Batch size (1024)
+            - learning_rate: Learning rate (1e-4)
+            - warmup_steps: Learning-rate warmup steps (1000)
+            - appliance_params: per-appliance statistics ({})
+            - mains_mean / mains_std: mains normalisation constants (1800 / 600)
+
+        Raises:
+            SequenceLengthError: ``sequence_length`` is even.
+        """
+        # NOTE(review): called before super().__init__(), as in the original;
+        # the helper lives in nilmtk_contrib.utils.model -- confirm it does not
+        # depend on base-class state.
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
+        super().__init__()
+
+        self.MODEL_NAME = "NILMFormer"
+        self.models = OrderedDict()
+        self.file_prefix = f"{self.MODEL_NAME.lower()}-temp-weights"
+
+        # Model architecture parameters intended to follow NILMFormer defaults.
+        self.sequence_length = params.get('sequence_length', 99)
+        self.c_in = params.get('c_in', 1)
+        self.c_embedding = params.get('c_embedding', 8)
+        self.c_out = params.get('c_out', 1)
+        self.d_model = params.get('d_model', 96)
+        self.n_heads = params.get('n_heads', 8)
+        self.n_encoder_layers = params.get('n_encoder_layers', 3)
+        self.dp_rate = params.get('dp_rate', 0.2)
+        self.pffn_ratio = params.get('pffn_ratio', 4)
+        self.kernel_size = params.get('kernel_size', 3)
+        self.kernel_size_head = params.get('kernel_size_head', 3)
+        self.dilations = params.get('dilations', [1, 2, 4, 8])
+        self.conv_bias = params.get('conv_bias', True)
+        self.norm_eps = params.get('norm_eps', 1e-5)
+
+        # Training parameters
+        self.chunk_wise_training = params.get('chunk_wise_training', False)
+        self.n_epochs = params.get('n_epochs', 100)
+        self.batch_size = params.get('batch_size', 1024)
+        self.learning_rate = params.get('learning_rate', 1e-4)
+        self.warmup_steps = params.get('warmup_steps', 1000)
+
+        # Data parameters
+        self.appliance_params = params.get('appliance_params', {})
+        self.mains_mean = params.get('mains_mean', 1800)
+        self.mains_std = params.get('mains_std', 600)
+
+        # Device configuration: CUDA when available, otherwise CPU.
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        _log_print(f"NILMFormer using device: {self.device}")
+
+        if self.sequence_length % 2 == 0:
+            _log_print("Sequence length should be odd!")
+            # Carry the reason in the exception too, so it survives when the
+            # log output is not captured.
+            raise SequenceLengthError("Sequence length should be odd!")
+
+    def return_network(self):
+        """Build a fresh NILMFormerNetwork from the stored hyper-parameters and move it to ``self.device``."""
+        network_kwargs = dict(
+            c_in=self.c_in,
+            c_embedding=self.c_embedding,
+            c_out=self.c_out,
+            kernel_size=self.kernel_size,
+            kernel_size_head=self.kernel_size_head,
+            dilations=self.dilations,
+            conv_bias=self.conv_bias,
+            n_encoder_layers=self.n_encoder_layers,
+            d_model=self.d_model,
+            dp_rate=self.dp_rate,
+            pffn_ratio=self.pffn_ratio,
+            n_heads=self.n_heads,
+            norm_eps=self.norm_eps,
+        )
+        network = NILMFormerNetwork(**network_kwargs)
+        return network.to(self.device)
+
+    def create_exogene_features(self, n_samples, sequence_length, start_date=None):
+        """
+        Create sinusoidal calendar features for a batch of windows.
+
+        Sample ``i`` is assigned the timestamps
+        ``start_date + (i * sequence_length + j)`` minutes for
+        ``j = 0 .. sequence_length - 1``. Consecutive samples therefore tile one
+        contiguous 1-minute timeline, and all features are generated with a
+        single ``create_exogene`` call that is then folded into per-sample rows.
+
+        NOTE(review): these timestamps are synthetic. They come from
+        ``start_date`` (2023-01-01 when omitted), not from the index of the
+        data being windowed -- confirm this is intended by the callers.
+
+        Args:
+            n_samples: Number of samples (0 yields an empty tensor).
+            sequence_length: Length of each sequence.
+            start_date: First timestamp (datetime-like or string); None selects
+                the fixed reference date 2023-01-01.
+
+        Returns:
+            float32 tensor of shape (n_samples, 8, sequence_length): sin and cos
+            channels for month, day-of-week, hour and minute.
+        """
+        if start_date is None:
+            # Use a reference date (start of 2023)
+            import datetime
+            start_date = datetime.datetime(2023, 1, 1)
+
+        # Data is assumed to be sampled every minute.
+        freq = "1min"
+        list_exo_variables = ['month', 'dow', 'hour', 'minute']
+
+        # One pass over the whole timeline instead of one pandas date_range
+        # per sample; the resulting values are identical.
+        timeline = create_exogene(
+            start_date=start_date,
+            sequence_length=n_samples * sequence_length,
+            freq=freq,
+            list_exo_variables=list_exo_variables,
+            cosinbase=True,  # Use sin/cos encoding
+            new_range=(-1, 1)
+        )[0]  # (n_features, n_samples * sequence_length)
+
+        # Split the time axis into (sample, step) and put the sample axis first.
+        n_features = timeline.shape[0]
+        per_sample = timeline.reshape(n_features, n_samples, sequence_length)
+        per_sample = np.ascontiguousarray(per_sample.transpose(1, 0, 2))
+
+        return torch.tensor(per_sample, dtype=torch.float32)
+
+    def partial_fit(self, train_main, train_appliances, do_preprocessing=True,
+                   current_epoch=0, **load_kwargs):
+        """
+        Train (or continue training) one network per appliance on a data chunk.
+
+        Args:
+            train_main: list of mains DataFrames.
+            train_appliances: list of ``(appliance_name, [DataFrame, ...])``.
+            do_preprocessing: run ``call_preprocessing(..., 'train')`` first.
+            current_epoch: forwarded unchanged to ``train_model``.
+            **load_kwargs: accepted but not used here.
+
+        Training is skipped when the chunk holds 10 windows or fewer; a model
+        for a new appliance is still created in that case.
+        """
+        # Derive appliance statistics from this chunk if none were supplied.
+        if not self.appliance_params:
+            self.set_appliance_params(train_appliances)
+
+        _log_print("...............NILMFormer partial_fit running...............")
+
+        if do_preprocessing:
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
+
+        seq_len = self.sequence_length
+
+        # Mains windows as (B, L, 1), with matching calendar features.
+        mains_windows = pd.concat(train_main, axis=0).values.reshape((-1, seq_len, 1))
+        exo = self.create_exogene_features(mains_windows.shape[0], seq_len)
+
+        # Network input: channel 0 is mains, the rest are exogenous -> (B, 1 + c, L)
+        mains_tensor = torch.tensor(mains_windows.transpose(0, 2, 1), dtype=torch.float32)
+        train_input = torch.cat([mains_tensor, exo], dim=1)
+
+        # Targets per appliance, each reshaped to (B, L, 1).
+        targets = []
+        for name, frames in train_appliances:
+            values = pd.concat(frames, axis=0).values.reshape((-1, seq_len, 1))
+            targets.append((name, torch.tensor(values, dtype=torch.float32)))
+
+        for appliance_name, power_tensor in targets:
+            if appliance_name in self.models:
+                _log_print(f"Started Retraining model for {appliance_name}")
+            else:
+                _log_print(f"First model training for {appliance_name}")
+                self.models[appliance_name] = self.return_network()
+
+            model = self.models[appliance_name]
+
+            if train_input.size(0) > 10:
+                self.train_model(model, train_input, power_tensor,
+                                 appliance_name, current_epoch)
+
+    def train_model(self, model, train_input, power_tensor, appliance_name, current_epoch):
+        """
+        Train a single appliance model and restore its best-validation weights.
+
+        Args:
+            model: network optimised in place.
+            train_input: input windows, indexed along dim 0.
+            power_tensor: targets of shape (B, L, 1); transposed to (B, 1, L) here.
+            appliance_name: used in log messages only.
+            current_epoch: accepted for API compatibility; not used.
+        """
+        # Random split: int(15%) of the windows are held out for validation
+        n_total = train_input.size(0)
+        val_split = int(0.15 * n_total)
+        indices = torch.randperm(n_total)
+        train_indices = indices[val_split:]
+        val_indices = indices[:val_split]
+
+        train_input_split = train_input[train_indices].to(self.device)
+        train_power_split = power_tensor[train_indices].to(self.device)
+        val_input_split = train_input[val_indices].to(self.device)
+        val_power_split = power_tensor[val_indices].to(self.device)
+
+        # For NILMFormer, we predict the full sequence
+        # Target shape: (batch, sequence_length, 1) -> (batch, 1, sequence_length)
+        train_power_split = train_power_split.transpose(1, 2)  # (B, 1, L)
+        val_power_split = val_power_split.transpose(1, 2)  # (B, 1, L)
+
+        # Create datasets and loaders
+        train_dataset = NILMDataset(train_input_split, train_power_split)
+        val_dataset = NILMDataset(val_input_split, val_power_split)
+        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
+        val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False)
+
+        # Setup optimizer with weight decay (important for transformers)
+        optimizer = optim.AdamW(
+            model.parameters(),
+            lr=self.learning_rate,
+            weight_decay=0.01,  # Weight decay for regularization
+            betas=(0.9, 0.95)   # Optimized betas for transformers
+        )
+
+        # Learning rate scheduler with warmup
+        total_steps = len(train_loader) * self.n_epochs
+        scheduler = optim.lr_scheduler.OneCycleLR(
+            optimizer,
+            max_lr=self.learning_rate,
+            total_steps=total_steps,
+            pct_start=0.1,  # 10% warmup
+            anneal_strategy='cos'
+        )
+
+        criterion = nn.MSELoss()
+        best_val_loss = float('inf')
+        best_model_path = checkpoint_path(".pth")
+        patience = 10
+        patience_counter = 0
+
+        _log_print(f"Training {appliance_name} with {total_steps} total steps using integrated exogenous features")
+        # Training loop
+        for epoch in range(self.n_epochs):
+            model.train()
+            train_losses = []
+            # Training phase
+            train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{self.n_epochs}")
+            for input_batch, power_batch in train_bar:
+                input_batch = input_batch.to(self.device)
+                power_batch = power_batch.to(self.device)
+                optimizer.zero_grad()
+                # Forward pass without timestamps
+                predictions = model(input_batch)  # Shape: (B, c_out, L)
+                loss = criterion(predictions, power_batch)
+                loss.backward()
+                # Gradient clipping (important for transformer stability)
+                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+                optimizer.step()
+                scheduler.step()
+                train_losses.append(loss.item())
+                train_bar.set_postfix(loss=loss.item(), lr=scheduler.get_last_lr()[0])
+
+            # Validation phase
+            model.eval()
+            val_losses = []
+            with torch.no_grad():
+                for input_batch, power_batch in val_loader:
+                    input_batch = input_batch.to(self.device)
+                    power_batch = power_batch.to(self.device)
+                    predictions = model(input_batch)
+                    loss = criterion(predictions, power_batch)
+                    val_losses.append(loss.item())
+
+            avg_train_loss = np.mean(train_losses)
+            # With an empty validation split, select the checkpoint on training loss
+            avg_val_loss = np.mean(val_losses) if val_losses else avg_train_loss
+
+            _log_print(f"Epoch {epoch+1}: Train Loss: {avg_train_loss:.6f}, "
+                  f"Val Loss: {avg_val_loss:.6f}, LR: {scheduler.get_last_lr()[0]:.2e}")
+
+            # Save best model and early stopping
+            if avg_val_loss < best_val_loss:
+                best_val_loss = avg_val_loss
+                torch.save(model.state_dict(), best_model_path)
+                _log_print(f"Saved best model for {appliance_name}")
+                patience_counter = 0
+            else:
+                patience_counter += 1
+                if patience_counter >= patience:
+                    _log_print(f"Early stopping triggered for {appliance_name}")
+                    break
+
+        # Restore the best weights; nothing was saved if no epoch had a finite loss
+        if np.isfinite(best_val_loss):
+            model.load_state_dict(torch.load(best_model_path, map_location=self.device))
+        model.eval()
+        _log_print(f"Training completed for {appliance_name}")
+
+    def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
+        """
+        Disaggregate power consumption for test data using NILMFormer.
+
+        Every mains chunk is zero-padded and cut into stride-1 windows, each
+        window is run through the per-appliance network, and the value the
+        network predicts for the centre of the window becomes the estimate
+        for one time step of the chunk.
+
+        Args:
+            test_main_list: list of mains DataFrames, one per chunk.
+            model: optional mapping ``appliance -> network`` that replaces
+                ``self.models`` before inference.
+            do_preprocessing: if True, window/normalise through
+                ``call_preprocessing``; otherwise the same steps run inline.
+
+        Returns:
+            list of float32 DataFrames (one per chunk) with one column per
+            appliance and ``len(chunk)`` rows.
+
+        Note:
+            The returned frames use a default integer index, not the index
+            of the input chunk.
+        """
+
+        # Allow callers to inject already-built networks
+        if model is not None:
+            self.models = model
+
+        seq_len = self.sequence_length
+        half = seq_len // 2  # padding per side, and index of the window centre
+
+        test_predictions = []
+        for test_mains_df in test_main_list:
+            disggregation_dict = {}
+
+            # Store original length before any preprocessing
+            original_length = len(test_mains_df)
+
+            if do_preprocessing:
+                # Use the standard preprocessing pipeline
+                processed = self.call_preprocessing(
+                    [test_mains_df], submeters_lst=None, method='test')
+                test_main_values = processed[0].values
+            else:
+                # Manual preprocessing: zero-pad, window with stride 1, normalise
+                flat = np.pad(
+                    test_mains_df.values.flatten(), (half, half),
+                    'constant', constant_values=(0, 0)
+                )
+                test_main_values = np.array([
+                    flat[i:i + seq_len] for i in range(len(flat) - seq_len + 1)
+                ])
+                test_main_values = (test_main_values - self.mains_mean) / self.mains_std
+
+            # One single-channel window per row
+            test_main_tensor = torch.tensor(
+                test_main_values.reshape((-1, 1, seq_len)),
+                dtype=torch.float32
+            )  # (N, 1, L)
+
+            # Create exogenous temporal features for test data
+            n_samples = test_main_tensor.shape[0]
+            test_exogenous = self.create_exogene_features(n_samples, seq_len)
+
+            # Prepare input: concatenate main power with exogenous features
+            test_input = torch.cat([test_main_tensor, test_exogenous], dim=1)  # (N, 1 + c_embedding, L)
+            test_input_tensor = test_input.to(self.device)
+
+            # Named `net` so the `model` argument is not shadowed
+            for appliance, net in self.models.items():
+                net.eval()
+
+                # Process in batches to avoid memory issues. Only the centre
+                # value of each window is used, so it is sliced before the copy
+                # to host memory instead of moving the full (B, c_out, L) output.
+                centre_values = []
+                with torch.no_grad():
+                    for start in range(0, len(test_input_tensor), self.batch_size):
+                        batch = test_input_tensor[start:start + self.batch_size]
+                        pred_batch = net(batch)  # Shape: (B, c_out, L)
+                        centre_values.append(pred_batch[:, 0, half].cpu().numpy())
+
+                point_predictions = np.concatenate(centre_values, axis=0)  # (N,)
+
+                # Window i is centred on sample i of the unpadded chunk, so the
+                # output series is the first `original_length` point predictions
+                # (left at zero where there are fewer windows than samples).
+                final_prediction = np.zeros(original_length)
+                n_valid = min(original_length, len(point_predictions))
+                final_prediction[:n_valid] = point_predictions[:n_valid]
+
+                # Denormalize the predictions
+                if appliance in self.appliance_params:
+                    app_mean = self.appliance_params[appliance]['mean']
+                    app_std = self.appliance_params[appliance]['std']
+                    final_prediction = final_prediction * app_std + app_mean
+
+                # Clip negative values
+                final_prediction_clipped = np.where(final_prediction > 0, final_prediction, 0)
+                disggregation_dict[appliance] = pd.Series(final_prediction_clipped)
+
+            # One column per appliance, cast to float32
+            results = pd.DataFrame(disggregation_dict, dtype='float32')
+            test_predictions.append(results)
+
+        return test_predictions
+
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """
+        Window and normalise mains (and, for training, appliance) readings.
+
+        Each series is flattened, zero-padded by ``sequence_length // 2`` on
+        both sides, cut into stride-1 windows of ``sequence_length`` samples
+        and standardised with the matching mean / std.
+
+        Args:
+            mains_lst: list of mains DataFrames.
+            submeters_lst: list of ``(appliance_name, [DataFrame, ...])``;
+                only read when ``method == 'train'``.
+            method: ``'train'`` returns mains and appliances; any other value
+                is treated as test mode and returns mains only.
+
+        Returns:
+            ``(mains_frames, [(appliance_name, frames), ...])`` for training,
+            otherwise just ``mains_frames``.
+
+        Raises:
+            ApplianceNotFoundError: an appliance has no entry in
+                ``self.appliance_params`` (training only).
+        """
+
+        window = self.sequence_length
+        # Number of zeros added to each end of a series before windowing
+        half = window // 2
+
+        def to_windows(frame, mean, std):
+            # Flatten -> zero-pad -> stride-1 windows -> standardise.
+            series = np.pad(frame.values.flatten(), (half, half),
+                            'constant', constant_values=(0, 0))
+            stacked = np.array([
+                series[k:k + window] for k in range(len(series) - window + 1)
+            ])
+            return (stacked - mean) / std
+
+        if method != 'train':
+            # Test preprocessing: mains only, reshaped to (-1, sequence_length)
+            return [
+                pd.DataFrame(
+                    to_windows(mains, self.mains_mean, self.mains_std)
+                    .reshape((-1, self.sequence_length)))
+                for mains in mains_lst
+            ]
+
+        # Training preprocessing
+        processed_mains_lst = [
+            pd.DataFrame(to_windows(mains, self.mains_mean, self.mains_std))
+            for mains in mains_lst
+        ]
+
+        # Appliances are standardised with their own stored mean / std
+        appliance_list = []
+        for app_name, app_df_list in submeters_lst:
+            if app_name not in self.appliance_params:
+                _log_print(self.appliance_params)
+                _log_print(f"Parameters for {app_name} were not found!")
+                raise ApplianceNotFoundError()
+
+            params = self.appliance_params[app_name]
+            app_mean, app_std = params['mean'], params['std']
+            appliance_list.append((app_name, [
+                pd.DataFrame(to_windows(app_df, app_mean, app_std))
+                for app_df in app_df_list
+            ]))
+
+        return processed_mains_lst, appliance_list
+
+    def denormalize_output(self, predictions, appliance_name):
+        """Undo the mean/std scaling of one appliance's predictions."""
+        # Appliances without stored statistics are returned untouched.
+        if appliance_name not in self.appliance_params:
+            return predictions
+        stats = self.appliance_params[appliance_name]
+        scale, offset = stats['std'], stats['mean']
+        return predictions * scale + offset
+
+    def set_appliance_params(self, train_appliances):
+        """Store the mean and std of every appliance in ``self.appliance_params``.
+
+        A std below 1 is replaced by 100 before it is stored.
+        """
+        for appliance, frames in train_appliances:
+            readings = np.array(pd.concat(frames, axis=0))
+            mean_value = np.mean(readings)
+            std_value = np.std(readings)
+            # The std is used as a divisor during preprocessing
+            std_value = 100 if std_value < 1 else std_value
+            self.appliance_params[appliance] = {'mean': mean_value, 'std': std_value}
+
+        _log_print("Appliance parameters:", self.appliance_params)
diff --git a/nilmtk_contrib/torch/preprocessing.py b/nilmtk_contrib/torch/preprocessing.py
index b21a71e..d7cb8a0 100644
--- a/nilmtk_contrib/torch/preprocessing.py
+++ b/nilmtk_contrib/torch/preprocessing.py
@@ -2,24 +2,54 @@
 import pandas as pd
 
 class ApplianceNotFoundError(Exception):
+    """Custom exception for when appliance parameters are not found."""
     pass
 
-def preprocess(sequence_length = None,mains_mean = None,mains_std = None,mains_lst = None,submeters_lst = None,method="train",appliance_params=None,windowing=False):
+def preprocess(sequence_length=None, mains_mean=None, mains_std=None, mains_lst=None, submeters_lst=None, method="train", appliance_params=None, windowing=False):
+    """
+    Preprocesses mains and appliance data by creating sliding windows and normalizing the data.
+
+    Args:
+        sequence_length (int): The length of the sliding window.
+        mains_mean (float): The mean of the mains data for normalization.
+        mains_std (float): The standard deviation of the mains data for normalization.
+        mains_lst (list of pd.DataFrame): A list of DataFrames, each containing mains data.
+        submeters_lst (list of tuples): A list where each tuple contains the appliance name 
+                                        (str) and a list of its corresponding DataFrames.
+        method (str, optional): The mode of operation, either "train" or "test". Defaults to "train".
+        appliance_params (dict, optional): A dictionary containing the mean and std for each 
+                                           appliance. Required if method is "train". Defaults to None.
+        windowing (bool, optional): If True, applies sliding window to appliance data. 
+                                    If False, normalizes the flattened appliance data. Defaults to False.
+
+    Returns:
+        If method is "test" or submeters_lst is not provided:
+            list of pd.DataFrame: A list of preprocessed mains dataframes.
+        If method is "train":
+            tuple: A tuple containing:
+                - list of pd.DataFrame: Preprocessed mains data.
+                - list of tuples: Preprocessed appliance data, structured like submeters_lst.
+    """
     pad = sequence_length // 2
 
+    # Preprocess mains data
     proc_mains = []
-
     for mains in mains_lst:
         v = mains.values.flatten()
-        v = np.pad(v,(pad,pad))
-        windows = np.array([v[i:i+sequence_length] for i in range(len(v)-sequence_length + 1)],dtype=np.float32)
-        windows = (windows - mains_mean)/mains_std
+        # Pad the sequence to handle windowing at the edges
+        v = np.pad(v, (pad, pad), 'constant', constant_values=(0,0))
+        # Create sliding windows
+        windows = np.array([v[i:i+sequence_length] for i in range(len(v) - sequence_length + 1)], dtype=np.float32)
+        # Normalize the windows
+        windows = (windows - mains_mean) / mains_std
         proc_mains.append(pd.DataFrame(windows))
+
+    # Return only mains data if in test mode or no appliance data is provided
     if method == "test" or not submeters_lst:
         return proc_mains
     
+    # Preprocess appliance data
     proc_apps = []
-
     for app_name, df_list in submeters_lst:
         if appliance_params is None or app_name not in appliance_params:
             raise ApplianceNotFoundError(f"Parameters for {app_name} not initialized.")
@@ -28,19 +58,19 @@ def preprocess(sequence_length = None,mains_mean = None,mains_std = None,mains_l
         std = appliance_params[app_name]["std"]
 
         sub = []
-
         for df in df_list:
             flat = df.values.flatten()
 
-
             if windowing:
-                flat = np.pad(flat,(pad,pad))
-                windows = np.array([flat[i:i+sequence_length] for i in range(len(flat)-sequence_length+1)],dtype=np.float32)
-                windows = (windows-mean)/std
+                # Apply padding and sliding window if specified
+                flat = np.pad(flat, (pad, pad), 'constant', constant_values=(0,0))
+                windows = np.array([flat[i:i+sequence_length] for i in range(len(flat) - sequence_length + 1)], dtype=np.float32)
+                windows = (windows - mean) / std
                 sub.append(pd.DataFrame(windows))
             else:
-                flat = (flat-mean)/std
-                sub.append(pd.DataFrame(flat.reshape(-1,1)))
-        proc_apps.append((app_name,sub))
+                # Normalize the flattened data directly
+                flat = (flat - mean) / std
+                sub.append(pd.DataFrame(flat.reshape(-1, 1)))
+        proc_apps.append((app_name, sub))
     
     return proc_mains, proc_apps
\ No newline at end of file
diff --git a/nilmtk_contrib/torch/reformer.py b/nilmtk_contrib/torch/reformer.py
new file mode 100644
index 0000000..76e53d5
--- /dev/null
+++ b/nilmtk_contrib/torch/reformer.py
@@ -0,0 +1,578 @@
+from collections import OrderedDict
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import TensorDataset, DataLoader
+import math
+from nilmtk.disaggregate import Disaggregator
+
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
+class SequenceLengthError(Exception):
+    pass
+
+class ApplianceNotFoundError(Exception):
+    pass
+
+# Axial Positional Embeddings
+class AxialPositionalEmbedding(nn.Module):
+    """
+    Axial positional embeddings for long sequences.
+
+    Position ``p`` is split into ``(p // cols, p % cols)`` with
+    ``rows, cols = axial_shape``; each coordinate has its own embedding
+    table and the two vectors are concatenated to ``dim`` features.
+    """
+    def __init__(self, dim, max_seq_len, axial_shape):
+        super().__init__()
+        self.dim = dim
+        self.max_seq_len = max_seq_len
+        self.axial_shape = axial_shape
+
+        assert len(axial_shape) == 2, "Axial shape must be 2D"
+        assert axial_shape[0] * axial_shape[1] == max_seq_len, "Axial shape must multiply to max_seq_len"
+
+        self.axial_dims = [dim // 2, dim - (dim // 2)]
+        self.pos_embs = nn.ModuleList([
+            nn.Embedding(axial_shape[0], self.axial_dims[0]),
+            nn.Embedding(axial_shape[1], self.axial_dims[1])
+        ])
+
+    def forward(self, x):
+        """Return ``x + positional embedding`` for ``x`` of shape (batch, n, dim)."""
+        _, n, _ = x.shape
+        # A longer input would index past the row table (opaque error on GPU).
+        if n > self.max_seq_len:
+            raise ValueError(f"Sequence length {n} exceeds max_seq_len {self.max_seq_len}")
+        positions = torch.arange(n, device=x.device)
+        cols = self.axial_shape[1]
+        row_emb = self.pos_embs[0](positions // cols)
+        col_emb = self.pos_embs[1](positions % cols)
+        # (n, dim) broadcasts over the batch dimension
+        return x + torch.cat([row_emb, col_emb], dim=-1)
+
+# LSH Attention Implementation
+class LSHSelfAttention(nn.Module):
+    """
+    Multi-head self-attention used by the Reformer blocks.
+
+    NOTE: despite the name, ``forward`` computes dense softmax attention over
+    all positions. ``hash_fn`` / ``hash_vectors`` are not called by
+    ``forward``; they are kept so the parameter layout stays unchanged.
+    """
+    def __init__(self, dim, heads=8, bucket_size=64, n_hashes=4, causal=False, dropout=0.):
+        super().__init__()
+        # Q/K/V are split evenly across heads; fail here instead of in forward().
+        assert dim % heads == 0, "dim must be divisible by heads"
+        self.dim = dim
+        self.heads = heads
+        self.bucket_size = bucket_size
+        self.n_hashes = n_hashes
+        self.causal = causal
+        self.dropout = nn.Dropout(dropout)
+        self.head_dim = dim // heads
+        self.to_qkv = nn.Linear(dim, dim * 3, bias=False)
+        self.to_out = nn.Linear(dim, dim)
+
+        # LSH parameters
+        self.hash_fn = nn.Linear(self.head_dim, n_hashes * bucket_size, bias=False)
+
+    def hash_vectors(self, vecs):
+        """Assign each (batch, n, head_dim) vector one bucket id per hash round."""
+        batch_size, seq_len, _ = vecs.shape
+        # Apply hash function
+        hash_codes = self.hash_fn(vecs)  # (b, n, n_hashes * bucket_size)
+        hash_codes = hash_codes.view(batch_size, seq_len, self.n_hashes, self.bucket_size)
+
+        # Get bucket assignments
+        return torch.argmax(hash_codes, dim=-1)  # (b, n, n_hashes)
+
+    def forward(self, x, mask=None):
+        """
+        Args:
+            x: (batch, n, dim) input.
+            mask: optional (batch, n) bool tensor; keys marked False are ignored.
+
+        Returns:
+            (batch, n, dim) tensor.
+        """
+        b, n, d = x.shape
+        h = self.heads
+
+        # Generate Q, K, V as (b, h, n, head_dim)
+        q, k, v = (t.view(b, n, h, -1).transpose(1, 2)
+                   for t in self.to_qkv(x).chunk(3, dim=-1))
+
+        # Scaled dot-product scores, (b, h, n, n)
+        scores = torch.einsum('bhid,bhjd->bhij', q * (self.head_dim ** -0.5), k)
+
+        # Apply causal mask if needed
+        if self.causal:
+            causal_mask = torch.tril(torch.ones(n, n, device=x.device, dtype=torch.bool))
+            scores = scores.masked_fill(~causal_mask, float('-inf'))
+
+        # Apply input mask if provided
+        if mask is not None:
+            scores = scores.masked_fill(~mask[:, None, None, :], float('-inf'))
+
+        attn = self.dropout(F.softmax(scores, dim=-1))
+
+        # Apply attention to values
+        out = torch.einsum('bhij,bhjd->bhid', attn, v)
+        out = out.transpose(1, 2).contiguous().view(b, n, d)
+        return self.to_out(out)
+
+# Chunk FeedForward Layer
+class ChunkFeedForward(nn.Module):
+    """
+    Position-wise MLP (Linear -> GELU -> Dropout -> Linear -> Dropout) that can
+    be evaluated on ``chunks`` slices of the sequence axis to lower peak memory.
+    """
+    def __init__(self, dim, mult=4, chunks=1, dropout=0.):
+        super().__init__()
+        self.chunks = chunks
+        self.dim = dim
+        inner = int(dim * mult)
+
+        layers = [
+            nn.Linear(dim, inner),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(inner, dim),
+            nn.Dropout(dropout),
+        ]
+        self.net = nn.Sequential(*layers)
+
+    def forward(self, x):
+        # Split along dim=1 only when more than one chunk is configured.
+        if self.chunks != 1:
+            return torch.cat([self.net(part) for part in x.chunk(self.chunks, dim=1)], dim=1)
+        return self.net(x)
+
+# Reformer Block
+class ReformerBlock(nn.Module):
+    """
+    Pre-norm residual block: ``LSHSelfAttention`` followed by ``ChunkFeedForward``.
+
+    ``heads``, ``bucket_size``, ``n_hashes`` and ``causal`` are forwarded to the
+    attention layer, ``ff_mult`` / ``ff_chunks`` to the feed-forward layer, and
+    ``dropout`` to both.
+    """
+    def __init__(self, dim, heads=8, bucket_size=64, n_hashes=4, ff_mult=4,
+                 ff_chunks=1, causal=False, dropout=0.):
+        super().__init__()
+
+        attn_kwargs = dict(dim=dim, heads=heads, bucket_size=bucket_size,
+                           n_hashes=n_hashes, causal=causal, dropout=dropout)
+        ff_kwargs = dict(dim=dim, mult=ff_mult, chunks=ff_chunks, dropout=dropout)
+
+        # Attention sub-layer
+        self.norm1 = nn.LayerNorm(dim)
+        self.attn = LSHSelfAttention(**attn_kwargs)
+
+        # Feed-forward sub-layer
+        self.norm2 = nn.LayerNorm(dim)
+        self.ff = ChunkFeedForward(**ff_kwargs)
+
+    def forward(self, x, mask=None):
+        """Apply both sub-layers, each wrapped as ``x + sublayer(norm(x))``."""
+        attended = self.attn(self.norm1(x), mask=mask)
+        x = x + attended
+
+        fed = self.ff(self.norm2(x))
+        return x + fed
+
+# Main Reformer Network for NILM
+class ReformerNet(nn.Module):
+    """
+    Sequence-to-point network built from ``ReformerBlock`` layers.
+
+    Input is ``(batch, 1, sequence_length)``; the output is one value per
+    window, shape ``(batch, 1)``.
+
+    Block hyper-parameters are forwarded to ``ReformerBlock``; when
+    ``axial_position_shape`` is None the most square factorisation of
+    ``sequence_length`` is used.
+    """
+    def __init__(self, sequence_length, dim=512, depth=6, heads=8, bucket_size=64,
+                 n_hashes=4, ff_mult=4, ff_chunks=1, dropout=0.1,
+                 axial_position_emb=True, axial_position_shape=None):
+        super().__init__()
+
+        self.sequence_length = sequence_length
+        self.dim = dim
+
+        # Lift the single input channel to the model width
+        self.input_projection = nn.Linear(1, dim)
+
+        # Positional embeddings: axial tables or one learned tensor
+        if not axial_position_emb:
+            self.pos_emb = nn.Parameter(torch.randn(1, sequence_length, dim))
+        else:
+            if axial_position_shape is None:
+                # Largest divisor of sequence_length not above its square root
+                rows = int(math.sqrt(sequence_length))
+                while sequence_length % rows != 0:
+                    rows -= 1
+                axial_position_shape = (rows, sequence_length // rows)
+
+            self.pos_emb = AxialPositionalEmbedding(
+                dim=dim,
+                max_seq_len=sequence_length,
+                axial_shape=axial_position_shape
+            )
+
+        # Reformer blocks (causal=False: for NILM, we can use full attention)
+        block_kwargs = dict(
+            dim=dim,
+            heads=heads,
+            bucket_size=bucket_size,
+            n_hashes=n_hashes,
+            ff_mult=ff_mult,
+            ff_chunks=ff_chunks,
+            causal=False,
+            dropout=dropout,
+        )
+        self.blocks = nn.ModuleList([ReformerBlock(**block_kwargs) for _ in range(depth)])
+
+        # Output head
+        self.norm = nn.LayerNorm(dim)
+        self.to_out = nn.Sequential(
+            nn.Linear(dim, 1024),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(1024, 1)
+        )
+
+        # Re-initialise every Linear / LayerNorm created above
+        self._initialize_weights()
+
+    def _initialize_weights(self):
+        """
+        Xavier-uniform weights and zero biases for every ``nn.Linear``;
+        unit weight and zero bias for every ``nn.LayerNorm``.
+        """
+        for module in self.modules():
+            if isinstance(module, nn.LayerNorm):
+                nn.init.ones_(module.weight)
+                nn.init.zeros_(module.bias)
+            elif isinstance(module, nn.Linear):
+                nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+
+    def forward(self, x):
+        """Map ``(batch, 1, sequence_length)`` windows to ``(batch, 1)`` outputs."""
+        # (batch, 1, L) -> (batch, L, 1) -> (batch, L, dim)
+        hidden = self.input_projection(x.transpose(1, 2))
+
+        # Add positional embeddings
+        if isinstance(self.pos_emb, AxialPositionalEmbedding):
+            hidden = self.pos_emb(hidden)
+        else:
+            hidden = hidden + self.pos_emb
+
+        # Apply Reformer blocks
+        for block in self.blocks:
+            hidden = block(hidden)
+
+        # Final normalization, then global average pooling over the sequence
+        normed = self.norm(hidden)
+        pooled = normed.mean(dim=1)  # (batch, dim)
+        # Output projection
+        return self.to_out(pooled)  # (batch, 1)
+
+class Reformer(Disaggregator):
+    """
+    Reformer model for non-intrusive load monitoring.
+    
+    This implementation is based on the paper:
+    "Reformer: The Efficient Transformer"
+    https://arxiv.org/abs/2001.04451
+    
+    The model adapts the Reformer architecture for energy disaggregation tasks,
+    using locality-sensitive hashing (LSH) attention and reversible layers for
+    memory-efficient processing of long sequences.
+    
+    Architecture Overview:
+    - LSH self-attention for efficient attention computation
+    - Axial positional embeddings for long sequences
+    - Chunk feed-forward layers for memory efficiency
+    - Reversible residual connections (conceptually)
+    - Sequence-to-point prediction for energy disaggregation
+    
+    Parameters:
+        params (dict): Configuration parameters including:
+            - sequence_length (int): Length of input sequences (default: 99)
+            - dim (int): Model dimension (default: 512)
+            - depth (int): Number of transformer layers (default: 6)
+            - heads (int): Number of attention heads (default: 8)
+            - bucket_size (int): LSH bucket size (default: 64)
+            - n_hashes (int): Number of LSH hash functions (default: 4)
+            - ff_mult (int): Feed-forward expansion factor (default: 4)
+            - ff_chunks (int): Number of chunks for feed-forward (default: 1)
+            - dropout (float): Dropout rate (default: 0.1)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+    """
+    def __init__(self, params):
+        """Reads hyperparameters from ``params``; raises SequenceLengthError for an even sequence_length."""
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
+        super().__init__()
+        self.MODEL_NAME = "Reformer"
+        self.models = OrderedDict()
+        self.file_prefix = f"{self.MODEL_NAME.lower()}-temp-weights"
+
+        # Extract hyperparameters from params dict
+        self.chunk_wise_training = params.get("chunk_wise_training", False)
+        self.sequence_length = params.get("sequence_length", 99)
+        self.n_epochs = params.get("n_epochs", 10)
+        self.batch_size = params.get("batch_size", 512)
+        # Copied so that set_appliance_params() never mutates the caller's dict
+        self.appliance_params = dict(params.get("appliance_params", {}))
+        self.mains_mean = params.get("mains_mean", 1800)
+        self.mains_std = params.get("mains_std", 600)
+
+        # Reformer specific parameters
+        self.dim = params.get("dim", 512)
+        self.depth = params.get("depth", 6)
+        self.heads = params.get("heads", 8)
+        self.bucket_size = params.get("bucket_size", 64)
+        self.n_hashes = params.get("n_hashes", 4)
+        self.ff_mult = params.get("ff_mult", 4)
+        self.ff_chunks = params.get("ff_chunks", 1)
+        self.dropout = params.get("dropout", 0.1)
+        self.axial_position_emb = params.get("axial_position_emb", True)
+        self.axial_position_shape = params.get("axial_position_shape", None)
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        # Sequence length must be odd for proper windowing
+        if self.sequence_length % 2 == 0:
+            _log_print("Sequence length should be odd!")
+            raise SequenceLengthError("Sequence length should be odd!")
+        _log_print(f"Reformer initialized with sequence_length={self.sequence_length}")
+        _log_print(f"Reformer params: dim={self.dim}, depth={self.depth}, heads={self.heads}")
+        _log_print(f"LSH params: bucket_size={self.bucket_size}, n_hashes={self.n_hashes}")
+        _log_print(f"Using device: {self.device}")
+
+    def return_network(self):
+        """
+        Instantiate a ReformerNet from the stored hyperparameters, move it to
+        ``self.device``, log its parameter count and return it.
+        """
+        hyperparams = {
+            "sequence_length": self.sequence_length,
+            "dim": self.dim,
+            "depth": self.depth,
+            "heads": self.heads,
+            "bucket_size": self.bucket_size,
+            "n_hashes": self.n_hashes,
+            "ff_mult": self.ff_mult,
+            "ff_chunks": self.ff_chunks,
+            "dropout": self.dropout,
+            "axial_position_emb": self.axial_position_emb,
+            "axial_position_shape": self.axial_position_shape,
+        }
+        network = ReformerNet(**hyperparams).to(self.device)
+
+        n_weights = sum(tensor.numel() for tensor in network.parameters())
+        _log_print(f"Reformer model created with {n_weights:,} parameters")
+        return network
+
+    def _window_mains(self, mains_lst):
+        """Zero-pad, window and normalise each mains frame; returns a list of DataFrames."""
+        n = self.sequence_length
+        units_to_pad = n // 2
+        mains_df_list = []
+        for mains in mains_lst:
+            new_mains = mains.values.flatten()
+            new_mains = np.pad(new_mains, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+            # One window of length n per original sample, centred on that sample
+            new_mains = np.array([new_mains[i:i + n] for i in range(len(new_mains) - n + 1)])
+            new_mains = (new_mains - self.mains_mean) / self.mains_std
+            mains_df_list.append(pd.DataFrame(new_mains))
+        return mains_df_list
+
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """
+        Preprocesses data using a sliding window, matching seq2point.
+
+        Parameters:
+            mains_lst: iterable of mains frames (anything exposing ``.values``).
+            submeters_lst: iterable of (appliance_name, [frame, ...]) pairs;
+                only read when ``method == 'train'``.
+            method: 'train' returns (mains, appliances); anything else returns
+                only the windowed mains.
+
+        Raises:
+            ApplianceNotFoundError: if a training appliance has no entry in
+                ``self.appliance_params``.
+        """
+        mains_df_list = self._window_mains(mains_lst)
+        if method != 'train':
+            return mains_df_list
+
+        appliance_list = []
+        for app_name, app_df_list in submeters_lst:
+            if app_name not in self.appliance_params:
+                _log_print("Parameters for", app_name, "were not found!")
+                raise ApplianceNotFoundError()
+            app_mean = self.appliance_params[app_name]['mean']
+            app_std = self.appliance_params[app_name]['std']
+            # Appliance readings stay one normalised value per row
+            processed_appliance_dfs = [
+                pd.DataFrame((app_df.values.reshape((-1, 1)) - app_mean) / app_std)
+                for app_df in app_df_list
+            ]
+            appliance_list.append((app_name, processed_appliance_dfs))
+        return mains_df_list, appliance_list
+
+    def set_appliance_params(self, train_appliances):
+        """
+        Derive per-appliance mean/std from the training readings and store them
+        in ``self.appliance_params``; a std below 1 is replaced by 100.
+        """
+        for name, frames in train_appliances:
+            readings = np.array(pd.concat(frames, axis=0))
+            mean, std = np.mean(readings), np.std(readings)
+            # Presumably guards against a tiny divisor for near-constant appliances
+            stats = {'mean': mean, 'std': 100 if std < 1 else std}
+            self.appliance_params[name] = stats
+        _log_print(self.appliance_params)
+
+    def partial_fit(self, train_main, train_appliances, do_preprocessing=True, current_epoch=0, **load_kwargs):
+        """
+        Trains one Reformer model per appliance on a chunk of data.
+
+        A random 15% of the windows is held out for validation; the weights with
+        the lowest validation loss are checkpointed and restored after training.
+        ``current_epoch`` and ``load_kwargs`` are accepted but not used here.
+        """
+        # If no appliance wise parameters are provided, then compute them using the first chunk
+        if len(self.appliance_params) == 0:
+            self.set_appliance_params(train_appliances)
+
+        _log_print("...............Reformer partial_fit running...............")
+        # Do the pre-processing, such as windowing and normalizing
+        if do_preprocessing:
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
+
+        train_main = pd.concat(train_main, axis=0)
+        train_main = train_main.values.reshape((-1, self.sequence_length, 1))
+        new_train_appliances = []
+        for app_name, app_df in train_appliances:
+            app_df = pd.concat(app_df, axis=0)
+            app_df_values = app_df.values.reshape((-1, 1))
+            new_train_appliances.append((app_name, app_df_values))
+        train_appliances = new_train_appliances
+
+        for appliance_name, power in train_appliances:
+            # Check if the appliance was already trained. If not then create a new model for it
+            if appliance_name not in self.models:
+                _log_print("First model training for", appliance_name)
+                self.models[appliance_name] = self.return_network()
+            # Retrain the particular appliance
+            else:
+                _log_print("Started Retraining model for", appliance_name)
+
+            model = self.models[appliance_name]
+            # Sometimes chunks can be empty (or too small to split) after dropping NANS
+            if len(train_main) <= 10:
+                continue
+
+            # ReformerNet.forward expects (batch, 1, sequence_length)
+            train_main_tensor = torch.tensor(train_main, dtype=torch.float32).permute(0, 2, 1).to(self.device)
+            # reshape(-1) keeps the targets 1-D regardless of how many there are
+            power_tensor = torch.tensor(power, dtype=torch.float32).reshape(-1).to(self.device)
+
+            # Create validation split (n_samples > 10 here, so val_size >= 1)
+            n_samples = train_main_tensor.size(0)
+            val_size = max(1, int(0.15 * n_samples))
+            indices = torch.randperm(n_samples)
+            train_idx, val_idx = indices[val_size:], indices[:val_size]
+            train_loader = DataLoader(TensorDataset(train_main_tensor[train_idx], power_tensor[train_idx]),
+                                      batch_size=self.batch_size, shuffle=True)
+            val_loader = DataLoader(TensorDataset(train_main_tensor[val_idx], power_tensor[val_idx]),
+                                    batch_size=self.batch_size)
+
+            # Setup optimizer and loss
+            optimizer = torch.optim.Adam(model.parameters(), lr=0.005, betas=(0.9, 0.999), eps=1e-07, weight_decay=0.0)
+            criterion = nn.MSELoss()
+            best_val_loss = float('inf')
+            filepath = checkpoint_path(".pth")
+
+            for epoch in range(self.n_epochs):
+                model.train()
+                epoch_losses = []
+                for batch_X, batch_y in train_loader:
+                    optimizer.zero_grad()
+                    # reshape(-1) instead of squeeze(): a final batch of one sample must stay 1-D
+                    predictions = model(batch_X).reshape(-1)
+                    loss = criterion(predictions, batch_y)
+                    loss.backward()
+                    # Add gradient clipping like seq2point
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+                    optimizer.step()
+                    epoch_losses.append(loss.item())
+
+                # Validate in mini-batches so the hold-out set never needs a single huge forward pass
+                model.eval()
+                squared_error = 0.0
+                with torch.no_grad():
+                    for batch_X, batch_y in val_loader:
+                        batch_loss = criterion(model(batch_X).reshape(-1), batch_y)
+                        squared_error += batch_loss.item() * batch_y.numel()
+                val_loss = squared_error / val_size
+
+                avg_train_loss = np.mean(epoch_losses)
+                _log_print(f"Epoch {epoch+1}/{self.n_epochs} - loss: {avg_train_loss:.4f} - val_loss: {val_loss:.4f}")
+
+                # Save best model (matching seq2point's ModelCheckpoint behavior)
+                if val_loss < best_val_loss:
+                    best_val_loss = val_loss
+                    torch.save(model.state_dict(), filepath)
+                    _log_print(f"Validation loss improved, saving model to {filepath}")
+
+            # Reload the best weights only if this call wrote a checkpoint (none for n_epochs == 0 or NaN losses)
+            if best_val_loss < float('inf'):
+                model.load_state_dict(torch.load(filepath, map_location=self.device))
+
+    def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
+        """
+        Disaggregates a chunk of mains power data.
+
+        Inference runs in mini-batches of ``self.batch_size`` windows so a long
+        chunk never has to fit through the network in a single forward pass.
+        Returns one DataFrame per mains frame with a non-negative float32
+        column per appliance. ``model``, if given, replaces ``self.models``.
+        """
+        if model is not None:
+            self.models = model
+
+        # Preprocess the test mains such as windowing and normalizing
+        if do_preprocessing:
+            test_main_list = self.call_preprocessing(test_main_list, submeters_lst=None, method='test')
+
+        test_predictions = []
+        for test_main in test_main_list:
+            windows = test_main.values.reshape((-1, self.sequence_length, 1))
+            # ReformerNet.forward expects (batch, 1, sequence_length); batches are moved to the device one by one
+            windows = torch.tensor(windows, dtype=torch.float32).permute(0, 2, 1)
+            disaggregation_dict = {}
+            for appliance, network in self.models.items():
+                params = self.appliance_params[appliance]
+                network.eval()
+                with torch.no_grad():
+                    outputs = [network(batch.to(self.device)).reshape(-1).cpu()
+                               for batch in windows.split(self.batch_size)]
+                prediction = torch.cat(outputs).numpy() if outputs else np.zeros(0, dtype=np.float32)
+                # Denormalize with the appliance parameters, then clip negative power to zero
+                prediction = params['mean'] + prediction * params['std']
+                disaggregation_dict[appliance] = pd.Series(np.where(prediction > 0, prediction, 0))
+            test_predictions.append(pd.DataFrame(disaggregation_dict, dtype='float32'))
+        return test_predictions
diff --git a/nilmtk_contrib/torch/resnet.py b/nilmtk_contrib/torch/resnet.py
index b1f6b3e..6d00500 100644
--- a/nilmtk_contrib/torch/resnet.py
+++ b/nilmtk_contrib/torch/resnet.py
@@ -1,32 +1,21 @@
 from __future__ import print_function, division
-from warnings import warn
 
 from nilmtk.disaggregate import Disaggregator
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-from torch.utils.data import Dataset, DataLoader, TensorDataset
-import os
+from torch.utils.data import DataLoader, TensorDataset
 import pandas as pd
 import numpy as np
-import pickle
 from collections import OrderedDict
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split
-from tqdm import tqdm
-import random
-from nilmtk_contrib.torch.preprocessing import preprocess
-
-# Set random seeds
-random.seed(10)
-np.random.seed(10)
-torch.manual_seed(10)
-if torch.cuda.is_available():
-    torch.cuda.manual_seed(10)
-    torch.cuda.manual_seed_all(10)
+from nilmtk_contrib.utils.validation import safe_train_test_split as train_test_split
 
 # Set device
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 class SequenceLengthError(Exception):
@@ -36,112 +25,95 @@ class ApplianceNotFoundError(Exception):
     pass
 
 class IdentityBlock(nn.Module):
-    def __init__(self, filters, kernel_size, input_channels=None):
+    """
+    Residual block: three stride-1, 'same'-padded Conv1d layers whose output is added to the input.
+    The add in forward() assumes filters[2] equals the input channel count, filters[0].
+    """
+    def __init__(self, filters, kernel_size):
         super(IdentityBlock, self).__init__()
         
-        # Use input_channels if provided, otherwise assume filters[0]
-        in_channels = input_channels if input_channels is not None else filters[0]
-        
-        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=filters[0], 
-                              kernel_size=kernel_size, stride=1, padding=kernel_size//2)
+        # Channel path: filters[0] -> filters[0] -> filters[1] -> filters[2]; length is preserved
+        self.conv1 = nn.Conv1d(in_channels=filters[0], out_channels=filters[0], 
+                              kernel_size=kernel_size, stride=1, padding='same')
         self.conv2 = nn.Conv1d(in_channels=filters[0], out_channels=filters[1], 
-                              kernel_size=kernel_size, stride=1, padding=kernel_size//2)
+                              kernel_size=kernel_size, stride=1, padding='same')
         self.conv3 = nn.Conv1d(in_channels=filters[1], out_channels=filters[2], 
-                              kernel_size=kernel_size, stride=1, padding=kernel_size//2)
-        
-        # Shortcut connection - adjust if input and output channels don't match
-        if in_channels != filters[2]:
-            self.shortcut = nn.Conv1d(in_channels=in_channels, out_channels=filters[2], 
-                                    kernel_size=1, stride=1, padding=0)
-        else:
-            self.shortcut = nn.Identity()
+                              kernel_size=kernel_size, stride=1, padding='same')
     
     def forward(self, x):
+        # Store input for the residual connection
         identity = x
         
+        # Forward pass through convolutions with ReLU activations
         out = F.relu(self.conv1(x))
         out = F.relu(self.conv2(out))
         out = self.conv3(out)
         
-        identity = self.shortcut(identity)
-        
-        # Ensure both tensors have the same size
-        if out.size() != identity.size():
-            # Adjust size if needed
-            min_size = min(out.size(2), identity.size(2))
-            out = out[:, :, :min_size]
-            identity = identity[:, :, :min_size]
-        
-        out = out + identity
+        # Add the residual (identity) connection and apply final activation
+        out += identity
         out = F.relu(out)
         
         return out
 
 class ConvolutionBlock(nn.Module):
-    def __init__(self, filters, kernel_size, input_channels=None):
+    """
+    Residual block whose skip path is a convolution too (conv4 maps filters[0] -> filters[2] channels),
+    so output channels may differ from input channels; every conv is stride-1 with 'same' padding.
+    """
+    def __init__(self, filters, kernel_size):
         super(ConvolutionBlock, self).__init__()
         
-        # Use input_channels if provided, otherwise assume filters[0]
-        in_channels = input_channels if input_channels is not None else filters[0]
-        
-        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=filters[0], 
-                              kernel_size=kernel_size, stride=1, padding=kernel_size//2)
+        # Main path with three convolutional layers
+        self.conv1 = nn.Conv1d(in_channels=filters[0], out_channels=filters[0], 
+                              kernel_size=kernel_size, stride=1, padding='same')
         self.conv2 = nn.Conv1d(in_channels=filters[0], out_channels=filters[1], 
-                              kernel_size=kernel_size, stride=1, padding=kernel_size//2)
+                              kernel_size=kernel_size, stride=1, padding='same')
         self.conv3 = nn.Conv1d(in_channels=filters[1], out_channels=filters[2], 
-                              kernel_size=kernel_size, stride=1, padding=kernel_size//2)
-        self.conv4 = nn.Conv1d(in_channels=in_channels, out_channels=filters[2], 
-                              kernel_size=kernel_size, stride=1, padding=kernel_size//2)
+                              kernel_size=kernel_size, stride=1, padding='same')
+        
+        # Skip connection path to match the output channel dimension
+        self.conv4 = nn.Conv1d(in_channels=filters[0], out_channels=filters[2], 
+                              kernel_size=kernel_size, stride=1, padding='same')
     
     def forward(self, x):
+        # Store input for the skip connection
         identity = x
         
+        # Forward pass through the main path
         out = F.relu(self.conv1(x))
         out = F.relu(self.conv2(out))
-        out = F.relu(self.conv3(out))
-        
-        identity = F.relu(self.conv4(identity))
+        out = self.conv3(out)
         
-        # Ensure both tensors have the same size
-        if out.size() != identity.size():
-            min_size = min(out.size(2), identity.size(2))
-            out = out[:, :, :min_size]
-            identity = identity[:, :, :min_size]
+        # Transform the identity to match the output channels for the residual connection
+        identity = self.conv4(identity)
         
-        out = out + identity
+        # Add the residual connection and apply final activation
+        out += identity
         out = F.relu(out)
         
         return out
 
 class ResNetModel(nn.Module):
     """
-    ResNet model for appliance load disaggregation.
-    It includes initial convolutional layers, ResNet blocks, and fully connected layers.
+    A ResNet-based model for NILM, mirroring the original TensorFlow implementation.
     """
     def __init__(self, sequence_length, num_filters=30):
         super(ResNetModel, self).__init__()
         self.sequence_length = sequence_length
         self.num_filters = num_filters
         
-        # Initial layers - matching TensorFlow implementation exactly
+        # Initial layers, including double ReLU to match TensorFlow's structure
         self.zero_pad = nn.ZeroPad1d(3)
-        self.conv1 = nn.Conv1d(in_channels=1, out_channels=num_filters, 
-                              kernel_size=48, stride=2, padding=0)  # No padding here, ZeroPad1d handles it
+        self.conv1 = nn.Conv1d(in_channels=1, out_channels=num_filters, kernel_size=48, stride=2)
         self.bn1 = nn.BatchNorm1d(num_filters)
-        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=2, padding=0)
+        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=2)
         
-        # Calculate intermediate size after initial layers
-        self._calculate_intermediate_size()
-        
-        # ResNet blocks with proper input channel specification
-        self.conv_block = ConvolutionBlock([num_filters, num_filters, num_filters], 24, 
-                                         input_channels=num_filters)
-        self.identity_block1 = IdentityBlock([num_filters, num_filters, num_filters], 12,
-                                           input_channels=num_filters)
-        self.identity_block2 = IdentityBlock([num_filters, num_filters, num_filters], 6,
-                                           input_channels=num_filters)
+        # ResNet blocks
+        self.conv_block = ConvolutionBlock([num_filters, num_filters, num_filters], 24)
+        self.identity_block1 = IdentityBlock([num_filters, num_filters, num_filters], 12)
+        self.identity_block2 = IdentityBlock([num_filters, num_filters, num_filters], 6)
         
-        # Calculate the size after convolutions for fully connected layers
+        # Calculate the input size for the fully connected layers dynamically
         self._calculate_fc_input_size()
         
         # Fully connected layers
@@ -149,29 +121,17 @@ def __init__(self, sequence_length, num_filters=30):
         self.dropout = nn.Dropout(0.2)
         self.fc2 = nn.Linear(1024, sequence_length)
     
-    def _calculate_intermediate_size(self):
-        """Calculate size after initial conv and maxpool layers"""
-        # Start with sequence_length + 6 (3 padding on each side)
-        size = self.sequence_length + 6
-        # After conv1 with kernel=48, stride=2
-        size = (size - 48) // 2 + 1
-        # After maxpool with kernel=3, stride=2  
-        size = (size - 3) // 2 + 1
-        self.intermediate_size = size
-    
     def _calculate_fc_input_size(self):
-        """Calculate the size after all convolutions"""
-        # Create a dummy input to calculate the size after convolutions
-        dummy_input = torch.zeros(1, 1, self.sequence_length)
-        x = self._forward_conv_layers(dummy_input)
-        x = x.view(x.size(0), -1)
-        self.fc_input_size = x.size(1)
+        """Calculates the FC input size via a dummy pass; eval mode keeps BatchNorm running stats untouched."""
+        was_training, dummy_input = self.training, torch.zeros(1, 1, self.sequence_length)
+        with torch.no_grad():
+            self.fc_input_size = self.eval()._forward_conv_layers(dummy_input).flatten(1).shape[1]
+        self.train(was_training)
     
     def _forward_conv_layers(self, x):
-        """Forward pass through convolutional layers only"""
-        # Initial processing
+        """Performs the forward pass through the convolutional layers."""
         x = self.zero_pad(x)
-        x = self.conv1(x)
+        x = F.relu(self.conv1(x))
         x = self.bn1(x)
         x = F.relu(x)
         x = self.maxpool(x)
@@ -188,7 +148,7 @@ def forward(self, x):
         x = self._forward_conv_layers(x)
         
         # Fully connected layers
-        x = x.view(x.size(0), -1)  # Flatten
+        x = x.flatten(1)
         x = F.relu(self.fc1(x))
         x = self.dropout(x)
         x = self.fc2(x)
@@ -197,11 +157,34 @@ def forward(self, x):
 
 class ResNet(Disaggregator):
     """
-    ResNet-based disaggregator for NILMTK.
-    This class implements a ResNet model for disaggregating mains electricity data
-    into appliance-level data.
-    """ 
+    ResNet-based model for non-intrusive load monitoring.
+    
+    This implementation is based on the paper:
+    "Deep Residual Learning for Image Recognition"
+    https://arxiv.org/abs/1512.03385
+    
+    The model adapts the ResNet architecture for energy disaggregation tasks,
+    using residual connections to enable training of deep networks for predicting
+    individual appliance power consumption from aggregate household power measurements.
+    
+    Architecture Overview:
+    - 1D convolutional layers adapted for time series data
+    - Identity blocks with residual connections for feature learning
+    - Convolution blocks for changing channel dimensions
+    - Batch normalization and max pooling for regularization
+    - Fully connected layers for sequence prediction
+    
+    Parameters:
+        params (dict): Configuration parameters including:
+            - sequence_length (int): Length of input sequences (default: 299)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+            - chunk_wise_training (bool): Enable chunk-wise training (default: False)
+            - appliance_params (dict): Appliance-specific normalization parameters
+            - load_model_path (str): Path to load pre-trained models
+    """
     def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
         self.MODEL_NAME = "ResNet"
         self.chunk_wise_training = params.get('chunk_wise_training', False)
         self.sequence_length = params.get('sequence_length', 299)
@@ -215,212 +198,227 @@ def __init__(self, params):
         self.device = device
         
         if self.sequence_length % 2 == 0:
-            print("Sequence length should be odd!")
-            raise SequenceLengthError
+            raise SequenceLengthError("Sequence length must be odd!")
     
     def partial_fit(self, train_main, train_appliances, do_preprocessing=True, **load_kwargs):
-        print("...............ResNet partial_fit running...............")
+        """Trains the model on a chunk of data."""
+        _log_print("...............ResNet partial_fit running...............")
         
-        if len(self.appliance_params) == 0:
+        if not self.appliance_params:
             self.set_appliance_params(train_appliances)
         
         if do_preprocessing:
-            print("Preprocessing data...")
-            train_main, train_appliances = preprocess(
-                sequence_length=self.sequence_length,
-                mains_mean=self.mains_mean,
-                mains_std=self.mains_std,
-                mains_lst=train_main,
-                submeters_lst=train_appliances,
-                method="train",
-                appliance_params=self.appliance_params,
-                windowing=True
-            )
-        
-        train_main = pd.concat(train_main, axis=0)
-        train_main = train_main.values.reshape((-1, self.sequence_length, 1))
+            _log_print("Preprocessing data...")
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
+        
+        train_main = pd.concat(train_main, axis=0).values.reshape((-1, self.sequence_length, 1))
         
         new_train_appliances = []
         for app_name, app_dfs in train_appliances:
-            app_df = pd.concat(app_dfs, axis=0)
-            app_df_values = app_df.values.reshape((-1, self.sequence_length))
+            app_df_values = pd.concat(app_dfs, axis=0).values.reshape((-1, self.sequence_length))
             new_train_appliances.append((app_name, app_df_values))
         train_appliances = new_train_appliances
         
-        print(f"Training data shape: {train_main.shape}")
-        
-        # Progress bar for appliances
-        appliance_progress = tqdm(train_appliances, desc="Training appliances", unit="appliance")
+        _log_print(f"Training data shape: {train_main.shape}")
         
-        for appliance_name, power in appliance_progress:
-            appliance_progress.set_postfix({"Current": appliance_name})
-            
+        for appliance_name, power in train_appliances:
             if appliance_name not in self.models:
-                print(f"\nFirst model training for {appliance_name}")
+                _log_print(f"First time training for {appliance_name}")
                 self.models[appliance_name] = self.return_network()
             else:
-                print(f"\nStarted Retraining model for {appliance_name}")
+                _log_print(f"Retraining model for {appliance_name}")
             
             model = self.models[appliance_name]
-            if train_main.size > 0:
-                if len(train_main) > 10:
-                    # Convert to PyTorch tensors
+            if len(train_main) > 10:  # count windows, not elements, before the train/val split
+                    # Create training and validation sets
                     train_x, v_x, train_y, v_y = train_test_split(
-                        train_main, power, test_size=.15, random_state=10)
+                        train_main, power, test_size=0.15, random_state=10)
                     
+                    # Convert to PyTorch Tensors
                     train_x = torch.FloatTensor(train_x).permute(0, 2, 1).to(self.device)
                     v_x = torch.FloatTensor(v_x).permute(0, 2, 1).to(self.device)
                     train_y = torch.FloatTensor(train_y).to(self.device)
                     v_y = torch.FloatTensor(v_y).to(self.device)
                     
-                    # Create DataLoaders
+                    # Create DataLoaders for batching
                     train_dataset = TensorDataset(train_x, train_y)
                     val_dataset = TensorDataset(v_x, v_y)
                     train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
                     val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False)
                     
-                    # Training loop
+                    # Train the model
                     self.train_model(model, train_loader, val_loader, appliance_name)
     
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """
+        Window and z-score normalize mains (and, for training, appliance) data.
+
+        For ``method == 'train'`` every series is zero-padded by
+        ``sequence_length // 2`` on each side before windowing, and the result
+        is ``(processed_mains_lst, appliance_list)``. Any other ``method`` windows
+        the mains without padding and returns only the list of mains frames.
+        Raises ApplianceNotFoundError if an appliance has no stored parameters.
+        """
+        # Computed up front so both names exist even when mains_lst is empty.
+        n = self.sequence_length
+        units_to_pad = n // 2
+
+        def make_windows(values, pad):
+            # Stride-1 windows of length n over the (optionally zero-padded) series.
+            if pad:
+                values = np.pad(values, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+            return np.array([values[i:i + n] for i in range(len(values) - n + 1)])
+
+        if method == 'train':
+            processed_mains_lst = []
+            for mains in mains_lst:
+                new_mains = make_windows(mains.values.flatten(), pad=True)
+                new_mains = (new_mains - self.mains_mean) / self.mains_std
+                processed_mains_lst.append(pd.DataFrame(new_mains))
+
+            appliance_list = []
+            for app_name, app_df_lst in submeters_lst:
+                if app_name not in self.appliance_params:
+                    raise ApplianceNotFoundError(f"Parameters for appliance '{app_name}' not found!")
+                app_mean = self.appliance_params[app_name]['mean']
+                app_std = self.appliance_params[app_name]['std']
+                processed_app_dfs = []
+                for app_df in app_df_lst:
+                    new_app_readings = make_windows(app_df.values.flatten(), pad=True)
+                    new_app_readings = (new_app_readings - app_mean) / app_std
+                    processed_app_dfs.append(pd.DataFrame(new_app_readings))
+                appliance_list.append((app_name, processed_app_dfs))
+            return processed_mains_lst, appliance_list
+
+        # Any other method (i.e. 'test'): window without padding.
+        processed_mains_lst = []
+        for mains in mains_lst:
+            new_mains = make_windows(mains.values.flatten(), pad=False)
+            new_mains = (new_mains - self.mains_mean) / self.mains_std
+            new_mains = new_mains.reshape((-1, self.sequence_length))
+            processed_mains_lst.append(pd.DataFrame(new_mains))
+        return processed_mains_lst
+    
     def train_model(self, model, train_loader, val_loader, appliance_name):
-        optimizer = optim.Adam(model.parameters())
+        """Handles the training and validation loop for the model."""
+        # Optimizer with settings matching TensorFlow's defaults
+        optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-07)
         criterion = nn.MSELoss()
         
         best_val_loss = float('inf')
         best_model_state = None
+        patience = 10
+        patience_counter = 0
         
-        # Progress bar for epochs
-        epoch_progress = tqdm(range(self.n_epochs), desc=f"Training {appliance_name}", unit="epoch")
+        _log_print(f"Training {appliance_name} for {self.n_epochs} epochs...")
         
-        for epoch in epoch_progress:
-            # Training phase
+        for epoch in range(self.n_epochs):
+            # --- Training Phase ---
             model.train()
             train_loss = 0.0
             
-            # Progress bar for training batches
-            train_batch_progress = tqdm(train_loader, desc=f"Epoch {epoch+1} Training", 
-                                      leave=False, unit="batch")
-            
-            for batch_x, batch_y in train_batch_progress:
+            for batch_x, batch_y in train_loader:
                 optimizer.zero_grad()
-                
                 outputs = model(batch_x)
                 loss = criterion(outputs, batch_y)
-                
                 loss.backward()
-                optimizer.step()
                 
+                # Gradient clipping for training stability
+                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+                
+                optimizer.step()
                 train_loss += loss.item()
-                train_batch_progress.set_postfix({"Loss": f"{loss.item():.4f}"})
             
-            # Validation phase
+            # --- Validation Phase ---
             model.eval()
             val_loss = 0.0
             
-            # Progress bar for validation batches
-            val_batch_progress = tqdm(val_loader, desc=f"Epoch {epoch+1} Validation", 
-                                    leave=False, unit="batch")
-            
             with torch.no_grad():
-                for batch_x, batch_y in val_batch_progress:
+                for batch_x, batch_y in val_loader:
                     outputs = model(batch_x)
                     loss = criterion(outputs, batch_y)
                     val_loss += loss.item()
-                    val_batch_progress.set_postfix({"Loss": f"{loss.item():.4f}"})
             
             train_loss /= len(train_loader)
             val_loss /= len(val_loader)
             
-            # Update epoch progress bar
-            epoch_progress.set_postfix({
-                "Train Loss": f"{train_loss:.4f}",
-                "Val Loss": f"{val_loss:.4f}",
-                "Best": f"{best_val_loss:.4f}"
-            })
-            
-            # Save best model
+            # Early stopping and saving the best model
             if val_loss < best_val_loss:
                 best_val_loss = val_loss
                 best_model_state = model.state_dict().copy()
-                epoch_progress.write(f'New best model saved with val_loss: {val_loss:.4f}')
+                patience_counter = 0
+                _log_print(f'Epoch {epoch+1}: New best model found with validation loss: {val_loss:.6f}')
+            else:
+                patience_counter += 1
+            
+            if (epoch + 1) % 5 == 0:
+                _log_print(f'Epoch {epoch+1}/{self.n_epochs}: Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}')
+            
+            # Check for early stopping
+            if patience_counter >= patience and epoch >= 20:
+                _log_print(f"Stopping early at epoch {epoch+1} due to no improvement.")
+                break
         
-        # Load best model
+        # Load the best model state after training is complete
         if best_model_state is not None:
             model.load_state_dict(best_model_state)
-            print(f"\nLoaded best model for {appliance_name} with validation loss: {best_val_loss:.4f}")
+            _log_print(f"Finished training. Loaded best model for {appliance_name} with validation loss: {best_val_loss:.6f}")
     
     def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
+        """Disaggregates a chunk of mains data."""
         if model is not None:
             self.models = model
         
         if do_preprocessing:
-            print("Preprocessing test data...")
-            test_main_list = preprocess(
-                sequence_length=self.sequence_length,
-                mains_mean=self.mains_mean,
-                mains_std=self.mains_std,
-                mains_lst=test_main_list,
-                submeters_lst=None,
-                method="test",
-                appliance_params=self.appliance_params,
-                windowing=True
-            )
+            _log_print("Preprocessing test data...")
+            test_main_list = self.call_preprocessing(
+                test_main_list, submeters_lst=None, method='test')
         
         test_predictions = []
         
-        # Progress bar for test chunks
-        chunk_progress = tqdm(test_main_list, desc="Processing test chunks", unit="chunk")
-        
-        for test_mains_df in chunk_progress:
+        for test_mains_df in test_main_list:
             disggregation_dict = {}
             test_main_array = test_mains_df.values.reshape((-1, self.sequence_length, 1))
             test_main_tensor = torch.FloatTensor(test_main_array).permute(0, 2, 1).to(self.device)
             
-            # Progress bar for appliances in each chunk
-            appliance_progress = tqdm(self.models.items(), desc="Disaggregating appliances", 
-                                    leave=False, unit="appliance")
-            
-            for appliance, model in appliance_progress:
-                appliance_progress.set_postfix({"Current": appliance})
-                
+            for appliance, model in self.models.items():
                 model.eval()
                 
-                # Create DataLoader for batched prediction
+                # Create DataLoader for batched predictions
                 test_dataset = TensorDataset(test_main_tensor)
                 test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)
                 
                 predictions = []
-                
-                # Progress bar for prediction batches
-                pred_progress = tqdm(test_loader, desc=f"Predicting {appliance}", 
-                                   leave=False, unit="batch")
-                
                 with torch.no_grad():
-                    for batch_x, in pred_progress:
+                    for batch_x, in test_loader:
                         batch_pred = model(batch_x)
                         predictions.append(batch_pred.cpu().numpy())
                 
                 prediction = np.concatenate(predictions, axis=0)
                 
-                # Average predictions over sequences
-                l = self.sequence_length
-                n = len(prediction) + l - 1
-                sum_arr = np.zeros((n))
-                counts_arr = np.zeros((n))
+                # Average predictions over overlapping windows
+                window_length = self.sequence_length
+                n = len(prediction) + window_length - 1
+                sum_arr = np.zeros(n)
+                counts_arr = np.zeros(n)
                 
-                for i in range(len(prediction)):
-                    sum_arr[i:i + l] += prediction[i].flatten()
-                    counts_arr[i:i + l] += 1
+                for i, p in enumerate(prediction):
+                    sum_arr[i:i+window_length] += p.flatten()
+                    counts_arr[i:i+window_length] += 1
                 
-                for i in range(len(sum_arr)):
-                    sum_arr[i] = sum_arr[i] / counts_arr[i]
+                # Replace zero counts with one to avoid division by zero
+                counts_arr[counts_arr == 0] = 1
+                averaged_prediction = sum_arr / counts_arr
                 
                 # Denormalize predictions
-                prediction = (self.appliance_params[appliance]['mean'] + 
-                            (sum_arr * self.appliance_params[appliance]['std']))
-                valid_predictions = prediction.flatten()
-                valid_predictions = np.where(valid_predictions > 0, valid_predictions, 0)
-                df = pd.Series(valid_predictions)
+                app_mean = self.appliance_params[appliance]['mean']
+                app_std = self.appliance_params[appliance]['std']
+                denormalized_prediction = averaged_prediction * app_std + app_mean
+                
+                # Set negative values to zero
+                denormalized_prediction[denormalized_prediction < 0] = 0
+                df = pd.Series(denormalized_prediction)
                 disggregation_dict[appliance] = df
             
             results = pd.DataFrame(disggregation_dict, dtype='float32')
@@ -429,24 +427,36 @@ def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
         return test_predictions
     
     def return_network(self):
+        """Returns a new, initialized ResNet model."""
         model = ResNetModel(self.sequence_length).to(self.device)
+        
+        # Initialize weights to match TensorFlow's defaults
+        def init_weights(m):
+            if isinstance(m, (nn.Conv1d, nn.Linear)):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm1d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+        
+        model.apply(init_weights)
         return model
         
     def set_appliance_params(self, train_appliances):
-        print("Setting appliance parameters...")
-        
-        # Progress bar for setting appliance parameters
-        param_progress = tqdm(train_appliances, desc="Computing appliance stats", unit="appliance")
-        
-        for (app_name, df_list) in param_progress:
-            param_progress.set_postfix({"Current": app_name})
-            
-            l = np.array(pd.concat(df_list, axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
-            app_max = np.max(l)
-            app_min = np.min(l)
+        """Computes and sets normalization parameters for each appliance."""
+        _log_print("Setting appliance parameters...")
+        
+        for (app_name, df_list) in train_appliances:
+            values = np.concatenate([df.values for df in df_list])
+            app_mean = np.mean(values)
+            app_std = np.std(values)
+            app_max = np.max(values)
+            app_min = np.min(values)
             if app_std < 1:
                 app_std = 100
-            self.appliance_params.update({app_name: {'mean': app_mean, 'std': app_std, 
-                                                   'max': app_max, 'min': app_min}})
\ No newline at end of file
+            self.appliance_params[app_name] = {
+                'mean': app_mean, 'std': app_std, 
+                'max': app_max, 'min': app_min
+            }
+            _log_print(f"  {app_name}: mean={app_mean:.2f}, std={app_std:.2f}")
diff --git a/nilmtk_contrib/torch/resnet_classification.py b/nilmtk_contrib/torch/resnet_classification.py
index bdd81c8..909b7e3 100644
--- a/nilmtk_contrib/torch/resnet_classification.py
+++ b/nilmtk_contrib/torch/resnet_classification.py
@@ -1,292 +1,531 @@
-from __future__ import annotations
-import copy, numpy as np, pandas as pd
-from collections import OrderedDict
-from typing import Dict, Any, List, Tuple
-
+from __future__ import print_function, division
+from nilmtk.disaggregate import Disaggregator
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.utils.data import TensorDataset, DataLoader
-from tqdm import tqdm                        
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+import pandas as pd
+import numpy as np
+from collections import OrderedDict
+from nilmtk_contrib.utils.validation import safe_train_test_split as train_test_split
+import copy
 
-from nilmtk.disaggregate import Disaggregator
-from nilmtk_contrib.torch.preprocessing import preprocess
+# Project runtime, logging and classification-preprocessing helpers
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+from nilmtk_contrib.preprocessing.classification import (
+    appliance_threshold,
+    classification_metadata,
+    loss_weight_metadata,
+)
 
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 class SequenceLengthError(Exception):
     pass
 
-
 class ApplianceNotFoundError(Exception):
     pass
 
-
 class IdentityBlock(nn.Module):
-    """Residual block with identity shortcut connection."""
-    def __init__(self, ch: int, k: int):
-        super().__init__()
-        self.c1 = nn.Conv1d(ch, ch, k, padding="same")
-        self.c2 = nn.Conv1d(ch, ch, k, padding="same")
-        self.c3 = nn.Conv1d(ch, ch, k, padding="same")
-        self.relu = nn.ReLU()
-
+    """
+    An identity block for ResNet, where the input and output dimensions are the same.
+    This implementation mirrors the structure of the original TensorFlow version.
+    """
+    def __init__(self, filters, kernel_size):
+        super(IdentityBlock, self).__init__()
+        
+        # Three convolutional layers, maintaining the channel count
+        self.conv1 = nn.Conv1d(in_channels=filters[0], out_channels=filters[0], 
+                              kernel_size=kernel_size, stride=1, padding='same')
+        self.conv2 = nn.Conv1d(in_channels=filters[0], out_channels=filters[1], 
+                              kernel_size=kernel_size, stride=1, padding='same')
+        self.conv3 = nn.Conv1d(in_channels=filters[1], out_channels=filters[2], 
+                              kernel_size=kernel_size, stride=1, padding='same')
+    
     def forward(self, x):
-        s = x
-        x = self.relu(self.c1(x))
-        x = self.relu(self.c2(x))
-        x = self.c3(x)
-        return self.relu(x + s)
-
-
-class ConvBlock(nn.Module):
-    """Residual block with projection shortcut."""
-    def __init__(self, in_ch: int, mid: int, out: int, k: int):
-        super().__init__()
-        self.c1 = nn.Conv1d(in_ch, mid, k, padding="same")
-        self.c2 = nn.Conv1d(mid,   mid, k, padding="same")
-        self.c3 = nn.Conv1d(mid,   out, k, padding="same")
-        self.proj = nn.Conv1d(in_ch, out, 1)
-        self.relu = nn.ReLU()
+        # Store input for the residual connection
+        identity = x
+        
+        # Forward pass through convolutions with ReLU activations
+        out = F.relu(self.conv1(x))
+        out = F.relu(self.conv2(out))
+        out = self.conv3(out)
+        
+        # Add the residual (identity) connection and apply final activation
+        out += identity
+        out = F.relu(out)
+        
+        return out
 
+class ConvolutionBlock(nn.Module):
+    """
+    A convolutional block for ResNet that can change the input's channel dimension.
+    This implementation mirrors the structure of the original TensorFlow version.
+    """
+    def __init__(self, filters, kernel_size):
+        super(ConvolutionBlock, self).__init__()
+        
+        # Main path with three convolutional layers
+        self.conv1 = nn.Conv1d(in_channels=filters[0], out_channels=filters[0], 
+                              kernel_size=kernel_size, stride=1, padding='same')
+        self.conv2 = nn.Conv1d(in_channels=filters[0], out_channels=filters[1], 
+                              kernel_size=kernel_size, stride=1, padding='same')
+        self.conv3 = nn.Conv1d(in_channels=filters[1], out_channels=filters[2], 
+                              kernel_size=kernel_size, stride=1, padding='same')
+        
+        # Skip connection path to match the output channel dimension
+        self.conv4 = nn.Conv1d(in_channels=filters[0], out_channels=filters[2], 
+                              kernel_size=kernel_size, stride=1, padding='same')
+    
     def forward(self, x):
-        s = self.proj(x)
-        x = self.relu(self.c1(x))
-        x = self.relu(self.c2(x))
-        x = self.c3(x)
-        return self.relu(x + s)
-
+        # Store input for the skip connection
+        identity = x
+        
+        # Forward pass through the main path
+        out = F.relu(self.conv1(x))
+        out = F.relu(self.conv2(out))
+        out = self.conv3(out)
+        
+        # Transform the identity to match the output channels for the residual connection
+        identity = self.conv4(identity)
+        
+        # Add the residual connection and apply final activation
+        out += identity
+        out = F.relu(out)
+        
+        return out
 
-class _ResNetNet(nn.Module):
+class ResNetClassificationNet(nn.Module):
     """
-    ResNet-like architecture for load disaggregation.
-    This model uses convolutional layers to extract features from the input sequence,
-    followed by fully connected layers for regression and classification.
-    The model predicts both the disaggregated load and a binary classification for each time step.
+    A ResNet-based network for NILM that combines a classification subnetwork
+    and a regression subnetwork, mirroring the original TensorFlow implementation.
     """
-    def __init__(self, seq_len: int):
-        super().__init__()
-        self.seq_len = seq_len
-
-        # Classification head
-        self.cls_feat = nn.Sequential(
-            nn.Conv1d(1, 30, 10), nn.ReLU(),
-            nn.Conv1d(30, 30, 8), nn.ReLU(),
-            nn.Conv1d(30, 40, 6), nn.ReLU(),
-            nn.Conv1d(40, 50, 5), nn.ReLU(),
-            nn.Conv1d(50, 50, 5), nn.ReLU(),
-            nn.Conv1d(50, 50, 5), nn.ReLU(),
-            nn.Flatten(),
-            nn.LazyLinear(1024), nn.ReLU()
-        )
-        self.cls_head = nn.Linear(1024, seq_len)
-
-        # Regression branch
-        self.pad   = nn.ConstantPad1d((3, 3), 0)
-        self.conv0 = nn.Conv1d(1, 30, 48, stride=2)
-        self.bn0   = nn.BatchNorm1d(30)
-        self.pool0 = nn.MaxPool1d(3, stride=2)
-        self.block1 = ConvBlock(30, 30, 30, 24)
-        self.block2 = IdentityBlock(30, 12)
-        self.block3 = IdentityBlock(30,  6)
-        self.reg_end = nn.Sequential(
-            nn.Flatten(),
-            nn.LazyLinear(1024), nn.ReLU(),
-            nn.Dropout(0.2),
-            nn.Linear(1024, seq_len)
-        )
-
+    def __init__(self, sequence_length):
+        super(ResNetClassificationNet, self).__init__()
+        self.sequence_length = sequence_length
+        
+        # --- CLASSIFICATION SUBNETWORK ---
+        self.cls_conv1 = nn.Conv1d(1, 30, kernel_size=10, padding='valid')
+        self.cls_conv2 = nn.Conv1d(30, 30, kernel_size=8, padding='valid')
+        self.cls_conv3 = nn.Conv1d(30, 40, kernel_size=6, padding='valid')
+        self.cls_conv4 = nn.Conv1d(40, 50, kernel_size=5, padding='valid')
+        self.cls_conv5 = nn.Conv1d(50, 50, kernel_size=5, padding='valid')
+        self.cls_conv6 = nn.Conv1d(50, 50, kernel_size=5, padding='valid')
+        
+        # Calculate flattened size after convolutions
+        conv_output_length = sequence_length - (10-1) - (8-1) - (6-1) - (5-1) - (5-1) - (5-1)
+        self.cls_flatten_size = 50 * conv_output_length
+        
+        self.cls_dense1 = nn.Linear(self.cls_flatten_size, 1024)
+        self.cls_dense2 = nn.Linear(1024, sequence_length)
+        
+        # --- REGRESSION SUBNETWORK (ResNet) ---
+        self.zero_pad = nn.ConstantPad1d(3, 0)  # zero-pads 3 per side; also available where nn.ZeroPad1d is not
+        self.reg_conv1 = nn.Conv1d(in_channels=1, out_channels=30, kernel_size=48, stride=2)
+        self.reg_bn1 = nn.BatchNorm1d(30)
+        self.reg_maxpool = nn.MaxPool1d(kernel_size=3, stride=2)
+        
+        # ResNet blocks with parameters aligned to the TensorFlow backend.
+        self.conv_block = ConvolutionBlock([30, 30, 30], 24)
+        self.identity_block1 = IdentityBlock([30, 30, 30], 12)
+        self.identity_block2 = IdentityBlock([30, 30, 30], 6)
+        
+        # Calculate the input size for the fully connected layers dynamically
+        self._calculate_fc_input_size()
+        
+        # Fully connected layers for regression
+        self.reg_fc1 = nn.Linear(self.fc_input_size, 1024)
+        self.reg_dropout = nn.Dropout(0.2)
+        self.reg_fc2 = nn.Linear(1024, sequence_length)
+        
+        # Initialize weights
+        self._initialize_weights()
+    
+    def _calculate_fc_input_size(self):
+        """Calculates the input size for the FC layers via a dummy forward pass."""
+        with torch.no_grad():
+            dummy_input = torch.zeros(1, 1, self.sequence_length)
+            x = self._forward_regression_conv_layers(dummy_input)
+            self.fc_input_size = x.flatten(1).shape[1]
+    
+    def _forward_regression_conv_layers(self, x):
+        """Performs the forward pass through the regression conv layers."""
+        x = self.zero_pad(x)
+        x = F.relu(self.reg_conv1(x))
+        x = self.reg_bn1(x)
+        x = F.relu(x)
+        x = self.reg_maxpool(x)
+        
+        x = self.conv_block(x)
+        x = self.identity_block1(x)
+        x = self.identity_block2(x)
+        
+        return x
+    
+    def _initialize_weights(self):
+        """Initializes weights to match TensorFlow's defaults."""
+        for m in self.modules():
+            if isinstance(m, (nn.Conv1d, nn.Linear)):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm1d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+        
+        # Use He normal initialization for the first dense layer in classification
+        nn.init.kaiming_normal_(self.cls_dense1.weight, nonlinearity='relu')
+    
     def forward(self, x):
-        cls = torch.sigmoid(self.cls_head(self.cls_feat(x)))
-        y   = self.pad(x)
-        y   = F.relu(self.bn0(self.conv0(y)))
-        y   = self.pool0(y)
-        y   = self.block1(y)
-        y   = self.block2(y)
-        y   = self.block3(y)
-        reg = self.reg_end(y)
-        return reg * cls, cls  # apply classification mask to regression output
-
+        # Input shape: (batch_size, 1, sequence_length)
+        
+        # --- CLASSIFICATION SUBNETWORK ---
+        cls_x = F.relu(self.cls_conv1(x))
+        cls_x = F.relu(self.cls_conv2(cls_x))
+        cls_x = F.relu(self.cls_conv3(cls_x))
+        cls_x = F.relu(self.cls_conv4(cls_x))
+        cls_x = F.relu(self.cls_conv5(cls_x))
+        cls_x = F.relu(self.cls_conv6(cls_x))
+        cls_x = cls_x.view(cls_x.size(0), -1)  # Flatten
+        cls_x = F.relu(self.cls_dense1(cls_x))
+        classification_output = torch.sigmoid(self.cls_dense2(cls_x))
+        
+        # --- REGRESSION SUBNETWORK ---
+        reg_x = self._forward_regression_conv_layers(x)
+        
+        # Flatten and pass through dense layers
+        reg_x = reg_x.flatten(1)
+        reg_x = F.relu(self.reg_fc1(reg_x))
+        reg_x = self.reg_dropout(reg_x)
+        regression_output = self.reg_fc2(reg_x)
+        
+        # Final output is the element-wise product of the two subnetworks
+        output = regression_output * classification_output
+        
+        return output, classification_output
 
 class ResNet_classification(Disaggregator):
-    """Residual network for NILM with classification-aware output scaling."""
-    def __init__(self, params: Dict[str, Any]):
-        super().__init__()
+    """
+    ResNet-based model with classification for non-intrusive load monitoring.
+    
+    This implementation is based on the paper:
+    "ResNet-based Multi-output Regression for NILM: Towards Enhanced Appliance State Detection"
+    https://arxiv.org/abs/2411.15805v1
+    
+    The model combines ResNet architecture with dual-output design for both appliance 
+    state classification and power consumption regression in energy disaggregation tasks.
+    
+    Architecture Overview:
+    - Classification subnetwork with 1D convolutions for appliance state detection
+    - Regression subnetwork with ResNet blocks for power prediction
+    - Identity and convolution blocks with residual connections
+    - Element-wise multiplication of classification and regression outputs
+    - Multi-output learning for enhanced appliance state detection
+    
+    Parameters:
+        params (dict): Configuration parameters including:
+            - sequence_length (int): Length of input sequences (default: 99)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+            - chunk_wise_training (bool): Enable chunk-wise training (default: False)
+            - appliance_params (dict): Appliance-specific normalization parameters
+            - mains_params (dict): Mains-specific normalization parameters
+    """
+    def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
         self.MODEL_NAME = "ResNet_classification"
-        self.chunk_wise_training = params.get("chunk_wise_training", True)
-        self.sequence_length = params.get("sequence_length", 99)
+        self.chunk_wise_training = params.get('chunk_wise_training', False)
+        self.sequence_length = params.get('sequence_length', 99)
+        self.n_epochs = params.get('n_epochs', 10)
+        self.models = OrderedDict()
+        self.mains_mean = 1800
+        self.mains_std = 600
+        self.batch_size = params.get('batch_size', 512)
+        self.appliance_params = params.get('appliance_params', {})
+        self.mains_params = params.get('mains_params', {})
+        self.device = device
+        self.classification_threshold = params.get('classification_threshold', params.get('on_power_threshold', 15))
+        self.regression_loss_weight = params.get('regression_loss_weight', 1.0)
+        self.classification_loss_weight = params.get('classification_loss_weight', 1.0)
+        self.classification_metadata = classification_metadata(
+            self.appliance_params,
+            self.classification_threshold,
+        )
+        self.loss_weight_metadata = loss_weight_metadata(
+            self.regression_loss_weight,
+            self.classification_loss_weight,
+        )
+        
         if self.sequence_length % 2 == 0:
-            raise SequenceLengthError("sequence_length must be odd")
-
-        self.n_epochs   = params.get("n_epochs",   10)
-        self.batch_size = params.get("batch_size", 512)
+            raise SequenceLengthError("Sequence length must be odd!")
 
-        self.mains_mean, self.mains_std = 1800, 600
-        self.appliance_params: Dict[str, Dict[str, float]] = {}
+    def return_network(self):
+        """Returns a new instance of the ResNetClassificationNet."""
+        return ResNetClassificationNet(self.sequence_length).to(self.device)
 
-        self.models: "OrderedDict[str,_ResNetNet]" = OrderedDict()
-        self.optims:  Dict[str, torch.optim.Optimizer] = {}
-        self.best:    Dict[str, float] = {}
+    def classify(self, classify_appliance):
+        """Creates binary on/off classification labels for appliances."""
+        appliance_on_off = []
 
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        for app_index, (appliance_name, on_off_list) in enumerate(classify_appliance):
+            threshold = appliance_threshold(
+                self.appliance_params,
+                appliance_name,
+                self.classification_threshold,
+            )
+            classification_appliance_dfs = []
+            for appliance in on_off_list:
+                n = self.sequence_length
+                units_to_pad = n // 2
+                appliance_copy = appliance.copy()
+                appliance_copy[appliance_copy <= threshold] = 0
+                appliance_copy[appliance_copy > threshold] = 1
+                new_app_readings = appliance_copy.values.flatten()
+                new_app_readings = np.pad(new_app_readings, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+                new_app_readings = np.array([new_app_readings[i:i + n] for i in range(len(new_app_readings) - n + 1)])
+                classification_appliance_dfs.append(pd.DataFrame(new_app_readings))
+            appliance_on_off.append((appliance_name, classification_appliance_dfs))
+        return appliance_on_off
+
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """Window and normalise mains (and, for training, appliance) readings.
+
+        With method == 'train', each series is zero-padded by sequence_length // 2
+        on both sides before windowing and (mains, appliances) is returned. For any
+        other method only the un-padded mains windows are returned.
+
+        Raises:
+            ApplianceNotFoundError: an appliance has no entry in appliance_params.
+        """
+        n = self.sequence_length
+        units_to_pad = n // 2
+
+        def sliding_windows(series, pad):
+            # All length-n windows over the series, zero-padded first when pad is set.
+            if pad:
+                series = np.pad(series, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+            return np.array([series[i:i + n] for i in range(len(series) - n + 1)])
+
+        is_train = method == 'train'
+        processed_mains_lst = []
+        for mains in mains_lst:
+            new_mains = sliding_windows(mains.values.flatten(), pad=is_train)
+            new_mains = (new_mains - self.mains_mean) / self.mains_std
+            if not is_train:
+                new_mains = new_mains.reshape((-1, self.sequence_length))
+            processed_mains_lst.append(pd.DataFrame(new_mains))
+        if not is_train:
+            return processed_mains_lst
+
+        appliance_list = []
+        for app_name, app_df_lst in submeters_lst:
+            if app_name not in self.appliance_params:
+                raise ApplianceNotFoundError(f"Parameters for appliance '{app_name}' not found!")
+            app_min = self.appliance_params[app_name]['min']
+            app_max = self.appliance_params[app_name]['max']
+            # A constant appliance (max == min) would otherwise divide by zero and
+            # fill the targets with NaN/inf, so fall back to a unit range.
+            app_range = (app_max - app_min) or 1.0
+
+            processed_app_dfs = []
+            for app_df in app_df_lst:
+                new_app_readings = sliding_windows(app_df.values.flatten(), pad=True)
+                # Min-max scaling to match the denormalisation used at inference time.
+                processed_app_dfs.append(pd.DataFrame((new_app_readings - app_min) / app_range))
+            appliance_list.append((app_name, processed_app_dfs))
+        return processed_mains_lst, appliance_list
+
+    def set_mains_params(self, train_main):
+        """Store the mean, std, min and max of all training mains readings in mains_params."""
+        flattened = [mains.values.flatten() for mains in train_main]
+        readings = np.concatenate(flattened)
+        stats = {}
+        for key, reducer in (('mean', np.mean), ('std', np.std), ('min', np.min), ('max', np.max)):
+            stats[key] = reducer(readings)
+        # update() mutates the existing dict in place rather than rebinding the attribute.
+        self.mains_params.update(stats)
+
+    def set_appliance_params(self, train_appliances):
+        """Record the mean, std, min and max of each appliance's training readings."""
+        for app_name, df_list in train_appliances:
+            readings = np.concatenate([df.values for df in df_list])
+            spread = np.std(readings)
+            # A standard deviation below 1 is replaced by the fixed fallback of 100.
+            spread = 100 if spread < 1 else spread
+            stats = {
+                'mean': np.mean(readings),
+                'std': spread,
+                'min': np.min(readings),
+                'max': np.max(readings),
+            }
+            self.appliance_params[app_name] = stats
 
-    def partial_fit(self, mains, appliances, do_preprocessing=True, **_):
+    def partial_fit(self, train_main, train_appliances, do_preprocessing=True, **load_kwargs):
+        """Trains the model on a chunk of data."""
+        _log_print("...............ResNet_classification partial_fit running...............")
+        
         if not self.appliance_params:
-            self.set_appliance_params(appliances)
-        self._set_mains_params(mains)
+            self.set_appliance_params(train_appliances)
+        if not self.mains_params:
+            self.set_mains_params(train_main)
 
         if do_preprocessing:
-            cls_labels = self._make_on_off(copy.deepcopy(appliances))
-            mains, appliances = preprocess(
-                sequence_length=self.sequence_length,
-                mains_mean=self.mains_mean,
-                mains_std=self.mains_std,
-                mains_lst=mains,
-                submeters_lst=appliances,
-                method="train",
-                appliance_params=self.appliance_params,
-                windowing=False
-            )
-
-        X = torch.tensor(pd.concat(mains).values, dtype=torch.float32).unsqueeze(1)  # [batch, seq_len, 1]
-        N = X.size(0)  # number of samples
-        perm = torch.randperm(N) 
-        val_idx, tr_idx = perm[:int(0.15 * N)], perm[int(0.15 * N):]
-        X_tr, X_val = X[tr_idx].to(self.device), X[val_idx].to(self.device)
-
-        y_reg, y_cls = {}, {}
-        for app, dfs in appliances:
-            y_reg[app] = torch.tensor(pd.concat(dfs).values, dtype=torch.float32)
-        for app, dfs in cls_labels:
-            y_cls[app] = torch.tensor(pd.concat(dfs).values, dtype=torch.float32)
-
-        mse, bce = nn.MSELoss(), nn.BCELoss()
-
-        for app in y_reg:
-            y_tr = y_reg[app][tr_idx].to(self.device)
-            y_val = y_reg[app][val_idx].to(self.device)
-            c_tr = y_cls[app][tr_idx].to(self.device)
-            c_val = y_cls[app][val_idx].to(self.device)
-
-            if app not in self.models:
-                net = _ResNetNet(self.sequence_length).to(self.device)
-                self.models[app] = net
-                self.optims[app] = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
-                self.best[app]   = np.inf
-
-            net, opt = self.models[app], self.optims[app]
-            loader = DataLoader(TensorDataset(X_tr, y_tr, c_tr),
-                                batch_size=self.batch_size, shuffle=True)
-
-            # training loop
-            for ep in range(self.n_epochs):
-                net.train()
-                ep_bar = tqdm(loader,
-                              desc=f"{app} ▏epoch {ep+1}/{self.n_epochs}",
-                              unit="batch", leave=False)   # live bar
-                running = 0.0
-                for xb, yb, cb in ep_bar:
-                    opt.zero_grad()
-                    pr, pc = net(xb)
-                    loss = mse(pr, yb) + bce(pc, cb)
-                    loss.backward()
-                    opt.step()
-                    running += loss.item()
-                    ep_bar.set_postfix(loss=f"{loss.item():.4f}")  # update
-
-                avg_loss = running / len(loader)
-
-                # validation
-                net.eval()
-                with torch.no_grad():
-                    vr, vc = net(X_val)
-                    v_loss = mse(vr, y_val).item() + bce(vc, c_val).item()
-
-                tqdm.write(f"[{app}] Epoch {ep+1}/{self.n_epochs} | " f"Train Loss: {avg_loss:.4f} | Val Loss: {v_loss:.4f}")   
-
-                if v_loss < self.best[app]:
-                    self.best[app] = v_loss
-                    torch.save(net.state_dict(), f"resnet_cls-{app}.pth")
-
-            net.load_state_dict(torch.load(f"resnet_cls-{app}.pth", map_location=self.device))
-
-    def disaggregate_chunk(self, mains, model=None, do_preprocessing=True):
+            # Build on/off labels from the raw readings before they are normalised.
+            # classify() copies each frame itself, so no defensive deepcopy is needed.
+            classification = self.classify(train_appliances)
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
+        else:
+            # Labels cannot be rebuilt from pre-normalised windows; fail clearly, not with a NameError.
+            raise ValueError("ResNet_classification.partial_fit requires do_preprocessing=True")
+        train_main = pd.concat(train_main, axis=0).values.reshape((-1, self.sequence_length, 1))
+
+        # Process appliance data for regression
+        new_train_appliances = []
+        for app_name, app_dfs in train_appliances:
+            app_df_values = pd.concat(app_dfs, axis=0).values.reshape((-1, self.sequence_length))
+            new_train_appliances.append((app_name, app_df_values))
+        train_appliances = new_train_appliances
+
+        # Process appliance data for classification
+        new_train_appliances_classification = {}
+        for app_name, app_df in classification:
+            app_df_values = pd.concat(app_df, axis=0).values.reshape((-1, self.sequence_length))
+            new_train_appliances_classification[app_name] = app_df_values
+        
+        for appliance_name, power in train_appliances:
+            if appliance_name not in self.models:
+                _log_print("First time training for", appliance_name)
+                self.models[appliance_name] = self.return_network()
+            else:
+                _log_print("Retraining model for", appliance_name)
+
+            model = self.models[appliance_name]
+            if train_main.size > 10:
+                    # Combine regression and classification targets
+                    power_df = pd.DataFrame(power)
+                    classification_df = pd.DataFrame(new_train_appliances_classification[appliance_name])
+                    power_combined = pd.concat([power_df, classification_df], axis=1).values
+
+                    # Split data into training and validation sets
+                    train_x, v_x, train_y_combined, v_y_combined = train_test_split(
+                        train_main, power_combined, test_size=0.15, random_state=10)
+
+                    train_y = train_y_combined[:, :self.sequence_length]
+                    v_y = v_y_combined[:, :self.sequence_length]
+                    appliance_train_classification = train_y_combined[:, self.sequence_length:]
+                    appliance_val_classification = v_y_combined[:, self.sequence_length:]
+
+                    # Convert to PyTorch tensors
+                    train_x = torch.tensor(train_x, dtype=torch.float32).permute(0, 2, 1).to(self.device)
+                    v_x = torch.tensor(v_x, dtype=torch.float32).permute(0, 2, 1).to(self.device)
+                    train_y = torch.tensor(train_y, dtype=torch.float32).to(self.device)
+                    v_y = torch.tensor(v_y, dtype=torch.float32).to(self.device)
+                    appliance_train_classification = torch.tensor(appliance_train_classification, dtype=torch.float32).to(self.device)
+                    appliance_val_classification = torch.tensor(appliance_val_classification, dtype=torch.float32).to(self.device)
+
+                    # Setup optimizer and loss functions
+                    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
+                    mse_loss = nn.MSELoss()
+                    bce_loss = nn.BCELoss()
+
+                    best_val_loss = float('inf')
+                    filepath = checkpoint_path(".pth")
+
+                    # Training loop
+                    for epoch in range(self.n_epochs):
+                        model.train()
+                        
+                        train_dataset = TensorDataset(train_x, train_y, appliance_train_classification)
+                        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
+
+                        epoch_losses = []
+                        for batch_x, batch_y, batch_c in train_loader:
+                            optimizer.zero_grad()
+                            output, classification_output = model(batch_x)
+                            
+                            # Combined loss for regression and classification
+                            loss = (
+                                self.regression_loss_weight * mse_loss(output, batch_y)
+                                + self.classification_loss_weight * bce_loss(classification_output, batch_c)
+                            )
+                            
+                            loss.backward()
+                            optimizer.step()
+                            epoch_losses.append(loss.item())
+
+                        # Validation
+                        model.eval()
+                        with torch.no_grad():
+                            val_output, val_classification = model(v_x)
+                            val_loss = (
+                                self.regression_loss_weight * mse_loss(val_output, v_y)
+                                + self.classification_loss_weight * bce_loss(val_classification, appliance_val_classification)
+                            )
+
+                        avg_train_loss = np.mean(epoch_losses)
+                        _log_print(f"Epoch {epoch+1}/{self.n_epochs} - loss: {avg_train_loss:.4f} - val_loss: {val_loss.item():.4f}")
+
+                        # Checkpoint on improvement; compare plain floats, not tensors.
+                        if val_loss.item() < best_val_loss:
+                            best_val_loss = val_loss.item()
+                            torch.save(model.state_dict(), filepath)
+                            _log_print(f"Validation loss improved, saving model to {filepath}")
+                    # Restore the best weights; no checkpoint exists if n_epochs == 0 or no loss was finite.
+                    if best_val_loss < float('inf'):
+                        model.load_state_dict(torch.load(filepath, map_location=self.device))
+
+    def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
+        """Disaggregates a chunk of mains data."""
         if model is not None:
             self.models = model
-        if do_preprocessing:
-            mains = preprocess(
-                sequence_length=self.sequence_length,
-                mains_mean=self.mains_mean,
-                mains_std=self.mains_std,
-                mains_lst=mains,
-                submeters_lst=None,
-                method="test",
-                appliance_params=self.appliance_params,
-                windowing=False
-            )
 
-        L = self.sequence_length
-        out = []
-        for m in mains:
-            X = torch.tensor(m.values, dtype=torch.float32).unsqueeze(1).to(self.device)
-            disc = {}
-            for app, net in self.models.items():
-                net.eval()
+        if do_preprocessing:
+            test_main_list = self.call_preprocessing(
+                test_main_list, submeters_lst=None, method='test')
+
+        test_predictions = []
+        for test_mains_df in test_main_list:
+            disggregation_dict = {}
+            test_main_array = test_mains_df.values.reshape((-1, self.sequence_length, 1))
+            test_main_tensor = torch.tensor(test_main_array, dtype=torch.float32).permute(0, 2, 1).to(self.device)
+
+            for appliance in self.models:
+                model = self.models[appliance]
+                model.eval()
+                
                 with torch.no_grad():
-                    pr, _ = net(X)  # pr: [batch, seq_len]
-                    pr = pr.cpu().numpy()
-
-                def overlap(wins):
-                    # Coverts overlapping windows into continuous sequence
-                    s, c = np.zeros(len(wins)+L-1), np.zeros(len(wins)+L-1)  # sum and count arrays
-                    for i in range(len(wins)):
-                        s[i:i+L] += wins[i].flatten()
-                        c[i:i+L] += 1
-                    return s / c
-
-                power = overlap(pr)
-                p = self.appliance_params[app]
-                power = np.clip(p["min"] + power*(p["max"]-p["min"]), 0, None)
-                disc[app] = pd.Series(power, dtype="float32")
-            out.append(pd.DataFrame(disc, dtype="float32"))
-        return out
-
-    def _make_on_off(self, apps):
-        """Convert appliance data to binary on/off labels."""
-        TH, n, pad = 15, self.sequence_length, self.sequence_length//2
-        res = []
-        for app, dfs in apps:
-            lbls = []
-            for df in dfs:
-                a = df.copy()
-                a[a<=TH] = 0; a[a>TH] = 1
-                v = np.pad(a.values.flatten(), (pad,pad))
-                w = np.array([v[i:i+n] for i in range(len(v)-n+1)])
-                lbls.append(pd.DataFrame(w))
-            res.append((app, lbls))
-        return res
-
-    def set_appliance_params(self, apps):
-        """Compute mean, std, min, max for each appliance."""
-        for app, dfs in apps:
-            data = np.concatenate([d.values.flatten() for d in dfs])
-            self.appliance_params[app] = {
-                "mean": data.mean(),
-                "std":  max(data.std(), 1.0),
-                "min":  data.min(),
-                "max":  data.max()
-            }
-
-    def _set_mains_params(self, mains):
-        """Compute mean and std for mains data."""
-        data = np.concatenate([m.values.flatten() for m in mains])
-        self.mains_mean, self.mains_std = data.mean(), data.std()
-
-    # NILMTK wrappers
-    def train(self, mains, apps, **kw):
-        return self.partial_fit(mains, apps, **kw)
-
-    def disaggregate(self, mains, store):
-        preds = self.disaggregate_chunk(mains)
-        for i, df in enumerate(preds):
-            for col in df.columns:
-                store.put(f"/building1/elec/meter{i+1}/{col}", df[col])
+                    prediction_output, _ = model(test_main_tensor)
+                    prediction = prediction_output.cpu().numpy()
+                
+                # Average predictions over overlapping windows
+                window_length = self.sequence_length
+                n = len(prediction)
+                sum_arr = np.zeros(n + window_length - 1)
+                counts_arr = np.zeros(n + window_length - 1)
+                for i in range(n):
+                    sum_arr[i:i+window_length] += prediction[i]
+                    counts_arr[i:i+window_length] += 1
+                for i in range(len(counts_arr)):
+                    if counts_arr[i] == 0:
+                        counts_arr[i] = 1
+                averaged_prediction = sum_arr / counts_arr
+                
+                # Denormalize the predictions
+                app_min = self.appliance_params[appliance]['min']
+                app_max = self.appliance_params[appliance]['max']
+                prediction = averaged_prediction * (app_max - app_min) + app_min
+                prediction[prediction < 0] = 0
+                
+                df = pd.Series(prediction)
+                disggregation_dict[appliance] = df
+            results = pd.DataFrame(disggregation_dict, dtype='float32')
+            test_predictions.append(results)
+        return test_predictions
+
+    def classification_output_plot(self, prediction_classification, appliance):
+        """Optional plotting function for classification output (matching TensorFlow)"""
+        pass  # Placeholder for plotting functionality
diff --git a/nilmtk_contrib/torch/rnn.py b/nilmtk_contrib/torch/rnn.py
index 52d3789..b10bfb4 100644
--- a/nilmtk_contrib/torch/rnn.py
+++ b/nilmtk_contrib/torch/rnn.py
@@ -4,26 +4,12 @@
 from nilmtk.disaggregate import Disaggregator
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-from torch.utils.data import Dataset, DataLoader, TensorDataset
-from sklearn.model_selection import train_test_split
-from tqdm import tqdm
-import random
-import os
-from nilmtk_contrib.torch.preprocessing import preprocess
+from torch.utils.data import TensorDataset, DataLoader
 
-# Set random seeds for reproducibility across runs
-random.seed(10)
-np.random.seed(10)
-torch.manual_seed(10)
-if torch.cuda.is_available():
-    torch.cuda.manual_seed(10)
-    torch.cuda.manual_seed_all(10)
-
-# Use GPU if available, otherwise fall back to CPU
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
 
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 class SequenceLengthError(Exception):
     pass
 
@@ -32,347 +18,287 @@ class ApplianceNotFoundError(Exception):
 
 class RNNModel(nn.Module):
     """
-    Neural network combining CNN feature extraction and bidirectional LSTMs
-    for NILM energy disaggregation.
+    An RNN-based model for NILM, with an architecture designed to mirror the
+    original TensorFlow implementation.
     """
     def __init__(self, sequence_length):
         super(RNNModel, self).__init__()
         self.sequence_length = sequence_length
         
-        # 1D CNN for initial feature extraction from raw power sequence
-        self.conv1d = nn.Conv1d(
-            in_channels=1, 
-            out_channels=16, 
-            kernel_size=4, 
-            stride=1, 
-            padding=2  # Maintain sequence length
-        )
-        
-        # First bidirectional LSTM layer
-        self.lstm1 = nn.LSTM(
-            input_size=16,
-            hidden_size=128,
-            num_layers=1,
-            batch_first=True,
-            bidirectional=True
-        )
-        
-        # Second bidirectional LSTM layer for deeper feature learning
-        self.lstm2 = nn.LSTM(
-            input_size=256,  # 128 * 2 (bidirectional)
-            hidden_size=256,
-            num_layers=1,
-            batch_first=True,
-            bidirectional=True
-        )
+        # Layers are defined to match the TensorFlow architecture
+        self.conv1d = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=4, 
+                                stride=1, padding=2) # symmetric padding: output length is seq_len + 1, not an exact 'same'
+        self.lstm1 = nn.LSTM(input_size=16, hidden_size=128, batch_first=True, bidirectional=True)
+        self.lstm2 = nn.LSTM(input_size=256, hidden_size=256, batch_first=True, bidirectional=True)
+        self.fc1 = nn.Linear(512, 128)
+        self.fc2 = nn.Linear(128, 1)
         
-        # Final fully connected layers for prediction
-        self.fc1 = nn.Linear(512, 128)  # 256 * 2 (bidirectional)
-        self.fc2 = nn.Linear(128, 1)   # Output single power value
-        
-        # Dropout for regularization
-        self.dropout = nn.Dropout(0.1)
+        self._init_weights()
+    
+    def _init_weights(self):
+        """Applies Xavier-uniform weights and zero biases to every Conv1d, Linear and LSTM submodule."""
+        # NOTE(review): Keras LSTM defaults also use orthogonal recurrent weights and a unit forget-gate bias; neither is replicated here — confirm this is intended.
+        for m in self.modules():
+            if isinstance(m, (nn.Conv1d, nn.Linear)):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.LSTM):
+                for name, param in m.named_parameters():
+                    if 'weight' in name:
+                        nn.init.xavier_uniform_(param)
+                    elif 'bias' in name:
+                        nn.init.zeros_(param)
     
     def forward(self, x):
-        # Input shape: (batch_size, sequence_length, 1)
-        # Rearrange for Conv1D: (batch_size, channels, sequence_length)
-        x = x.permute(0, 2, 1)  # (batch_size, 1, sequence_length)
+        # Input shape: (batch, seq_len, 1) -> permute for Conv1D
+        x = x.permute(0, 2, 1)
         
-        # Extract features using 1D convolution
-        x = self.conv1d(x)  # (batch_size, 16, sequence_length)
+        # Feature extraction
+        x = self.conv1d(x)
         
-        # Rearrange back for LSTM: (batch_size, sequence_length, features)
-        x = x.permute(0, 2, 1)  # (batch_size, sequence_length, 16)
+        # Permute for LSTM layers
+        x = x.permute(0, 2, 1)
         
-        # Process through bidirectional LSTM layers
-        x, _ = self.lstm1(x)  # (batch_size, sequence_length, 256)
-        x = self.dropout(x)
+        # Sequence processing
+        x, _ = self.lstm1(x)
+        x, _ = self.lstm2(x)
         
-        x, _ = self.lstm2(x)  # (batch_size, sequence_length, 512)
-        
-        # Use only the last time step output
-        x = x[:, -1, :]  # (batch_size, 512)
+        # In the original TF model, only the output of the last time step is used.
+        x = x[:, -1, :]
         
         # Final prediction layers
-        x = torch.tanh(self.fc1(x))  # (batch_size, 128)
-        x = self.dropout(x)
-        x = self.fc2(x)  # (batch_size, 1)
+        x = torch.tanh(self.fc1(x))
+        x = self.fc2(x)
         
         return x
 
 class RNN(Disaggregator):
     """
-    NILM disaggregator using RNN without attention mechanism.
-    Inherits from NILMTK's Disaggregator base class.
-    """
+    RNN disaggregator for Non-Intrusive Load Monitoring (NILM).
+    
+    Based on "Neural NILM: Deep Neural Networks Applied to Energy Disaggregation"
+    (https://arxiv.org/abs/1507.06594). This implementation uses a convolutional
+    layer followed by bidirectional LSTM layers to learn temporal patterns in
+    aggregate power consumption data and predict individual appliance usage.
     
+    The model architecture consists of:
+    1. 1D Convolutional layer for feature extraction from power sequences
+    2. Two bidirectional LSTM layers for learning long-term dependencies
+    3. Fully connected layers for final power regression
+    
+    Args:
+        params (dict): Dictionary containing model hyperparameters:
+            - sequence_length (int): Length of input sequences (default: 19)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+            - appliance_params (dict): Appliance-specific parameters
+            - mains_mean (float): Mean normalization for mains power (default: 1800)
+            - mains_std (float): Standard deviation for mains power (default: 600)
+            - chunk_wise_training (bool): Enable chunk-wise training (default: False)
+    """
     def __init__(self, params):
-        """Initialize the disaggregator with hyperparameters"""
+        """Initializes the disaggregator and its hyperparameters."""
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
         self.MODEL_NAME = "RNN"
-        self.models = OrderedDict()  # Store separate models for each appliance
-        self.file_prefix = "{}-temp-weights".format(self.MODEL_NAME.lower())
+        self.models = OrderedDict()
+        self.file_prefix = f"{self.MODEL_NAME.lower()}-temp-weights"
         
-        # Extract hyperparameters from params dict
         self.chunk_wise_training = params.get('chunk_wise_training', False)
         self.sequence_length = params.get('sequence_length', 19)
         self.n_epochs = params.get('n_epochs', 10)
         self.batch_size = params.get('batch_size', 512)
-        self.appliance_params = params.get('appliance_params', {})  # Normalization stats
+        self.appliance_params = params.get('appliance_params', {})
         self.mains_mean = params.get('mains_mean', 1800)
         self.mains_std = params.get('mains_std', 600)
-        self.device = device
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         
-        # Sequence length must be odd for proper windowing
         if self.sequence_length % 2 == 0:
-            print("Sequence length should be odd!")
-            raise SequenceLengthError
-    
+            raise SequenceLengthError("Sequence length must be odd for proper windowing.")
+
     def partial_fit(self, train_main, train_appliances, do_preprocessing=True, current_epoch=0, **load_kwargs):
-        """Train models on a chunk of data (supports incremental learning)"""
-        
-        # Compute appliance-specific normalization parameters if not provided
-        if len(self.appliance_params) == 0:
+        """Trains the model on a chunk of data."""
+        if not self.appliance_params:
             self.set_appliance_params(train_appliances)
+
+        _log_print("...............RNN partial_fit running...............")
         
-        print("...............RNN partial_fit running...............")
-        
-        # Preprocess data: windowing, normalization, etc.
         if do_preprocessing:
-            print("Preprocessing data...")
-            train_main, train_appliances = preprocess(
-                sequence_length=self.sequence_length,
-                mains_std=self.mains_std,
-                mains_mean=self.mains_mean,
-                mains_lst=train_main,
-                submeters_lst=train_appliances,
-                method="train",
-                appliance_params=self.appliance_params,
-                windowing=False
-            )
-        
-        # Prepare main power data for training
-        train_main = pd.concat(train_main, axis=0)
-        train_main = train_main.values.reshape((-1, self.sequence_length, 1))
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
+
+        # Prepare data for training
+        train_main = pd.concat(train_main, axis=0).values.reshape((-1, self.sequence_length, 1))
         
-        # Prepare appliance power data
         new_train_appliances = []
-        for app_name, app_df in train_appliances:
-            app_df = pd.concat(app_df, axis=0)
-            app_df_values = app_df.values.reshape((-1, 1))
+        for app_name, app_dfs in train_appliances:
+            app_df_values = pd.concat(app_dfs, axis=0).values.reshape((-1, 1))
             new_train_appliances.append((app_name, app_df_values))
         train_appliances = new_train_appliances
-        
-        print(f"Training data shape: {train_main.shape}")
-        
-        # Train a separate model for each appliance
-        appliance_progress = tqdm(train_appliances, desc="Training appliances", unit="appliance")
-        
-        for appliance_name, power in appliance_progress:
-            appliance_progress.set_postfix({"Current": appliance_name})
-            
-            # Create new model if this appliance hasn't been seen before
+
+        for appliance_name, power in train_appliances:
             if appliance_name not in self.models:
-                print(f"\nFirst model training for {appliance_name}")
+                _log_print(f"First time training for {appliance_name}")
                 self.models[appliance_name] = self.return_network()
             else:
-                print(f"\nStarted Retraining model for {appliance_name}")
-            
+                _log_print(f"Retraining model for {appliance_name}")
+
             model = self.models[appliance_name]
-            
-            # Train only if we have sufficient data
-            if train_main.size > 0:
-                if len(train_main) > 10:
-                    # Convert to PyTorch tensors and move to device
-                    train_x = torch.FloatTensor(train_main).to(self.device)
-                    train_y = torch.FloatTensor(power).to(self.device)
+            if len(train_main) > 10:  # count windows, not elements (.size is windows * sequence_length)
+                    filepath = checkpoint_path(".pt")
                     
-                    # Split data into training and validation sets
-                    train_x_split, val_x_split, train_y_split, val_y_split = train_test_split(
-                        train_x.cpu().numpy(), train_y.cpu().numpy(), 
-                        test_size=0.15, random_state=42
-                    )
+                    # Convert to PyTorch Tensors
+                    train_main_tensor = torch.tensor(train_main, dtype=torch.float32)
+                    power_tensor = torch.tensor(power, dtype=torch.float32).squeeze()
                     
-                    # Convert back to tensors and move to device
-                    train_x_split = torch.FloatTensor(train_x_split).to(self.device)
-                    val_x_split = torch.FloatTensor(val_x_split).to(self.device)
-                    train_y_split = torch.FloatTensor(train_y_split).to(self.device)
-                    val_y_split = torch.FloatTensor(val_y_split).to(self.device)
+                    # Use the last 15% of data for validation to mirror TensorFlow's behavior
+                    val_size = max(1, int(0.15 * len(train_main_tensor))) if len(train_main_tensor) > 1 else 0
+                    train_size = len(train_main_tensor) - val_size
                     
-                    # Create PyTorch DataLoaders for batch processing
-                    train_dataset = TensorDataset(train_x_split, train_y_split)
-                    val_dataset = TensorDataset(val_x_split, val_y_split)
+                    train_x = train_main_tensor[:train_size].to(self.device)
+                    val_x = train_main_tensor[train_size:].to(self.device)
+                    train_y = power_tensor[:train_size].to(self.device)
+                    val_y = power_tensor[train_size:].to(self.device)
+                    
+                    # Optimizer and loss function, with parameters matching TensorFlow
+                    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-07)
+                    criterion = nn.MSELoss()
+                    
+                    best_val_loss = float('inf')
+                    
+                    # Create DataLoader for batching
+                    train_dataset = TensorDataset(train_x, train_y)
                     train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
-                    val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False)
                     
-                    # Train the model
-                    self.train_model(model, train_loader, val_loader, appliance_name, current_epoch)
-    
-    def train_model(self, model, train_loader, val_loader, appliance_name, current_epoch):
-        """Train a single appliance model with early stopping based on validation loss"""
-        optimizer = optim.Adam(model.parameters(), lr=0.001)
-        criterion = nn.MSELoss()
-        
-        best_val_loss = float('inf')
-        best_model_state = None
-        
-        epoch_progress = tqdm(range(self.n_epochs), desc=f"Training {appliance_name}", unit="epoch")
-        
-        for epoch in epoch_progress:
-            # Training phase
-            model.train()
-            train_loss = 0.0
-            
-            train_batch_progress = tqdm(train_loader, desc=f"Epoch {epoch+1} Training", 
-                                      leave=False, unit="batch")
-            
-            for batch_x, batch_y in train_batch_progress:
-                optimizer.zero_grad()
-                
-                outputs = model(batch_x)
-                loss = criterion(outputs.squeeze(), batch_y.squeeze())
-                
-                loss.backward()
-                optimizer.step()
-                
-                train_loss += loss.item()
-                train_batch_progress.set_postfix({"Loss": f"{loss.item():.4f}"})
-            
-            # Validation phase
-            model.eval()
-            val_loss = 0.0
-            
-            val_batch_progress = tqdm(val_loader, desc=f"Epoch {epoch+1} Validation", 
-                                    leave=False, unit="batch")
-            
-            with torch.no_grad():
-                for batch_x, batch_y in val_batch_progress:
-                    outputs = model(batch_x)
-                    loss = criterion(outputs.squeeze(), batch_y.squeeze())
-                    val_loss += loss.item()
-                    val_batch_progress.set_postfix({"Loss": f"{loss.item():.4f}"})
-            
-            # Calculate average losses
-            train_loss /= len(train_loader)
-            val_loss /= len(val_loader)
-            
-            epoch_progress.set_postfix({
-                "Train Loss": f"{train_loss:.4f}",
-                "Val Loss": f"{val_loss:.4f}",
-                "Best": f"{best_val_loss:.4f}"
-            })
-            
-            # Save best model based on validation loss
-            if val_loss < best_val_loss:
-                best_val_loss = val_loss
-                best_model_state = model.state_dict().copy()
-                epoch_progress.write(f'New best model saved with val_loss: {val_loss:.4f}')
-                
-                # Save model checkpoint
-                filepath = f"{self.file_prefix}-{appliance_name.replace(' ', '_')}-epoch{current_epoch}.pth"
-                torch.save(best_model_state, filepath)
-        
-        # Load the best model weights
-        if best_model_state is not None:
-            model.load_state_dict(best_model_state)
-            print(f"\nLoaded best model for {appliance_name} with validation loss: {best_val_loss:.4f}")
-    
+                    for epoch in range(self.n_epochs):
+                        # --- Training Phase ---
+                        model.train()
+                        train_loss = 0.0
+                        
+                        for batch_x, batch_y in train_loader:
+                            optimizer.zero_grad()
+                            outputs = model(batch_x).squeeze(-1)
+                            loss = criterion(outputs, batch_y)
+                            loss.backward()
+                            optimizer.step()
+                            train_loss += loss.item()
+                        
+                        train_loss /= len(train_loader)
+                        
+                        # --- Validation Phase ---
+                        model.eval()
+                        with torch.no_grad():
+                            val_outputs = model(val_x).squeeze(-1)
+                            val_loss = criterion(val_outputs, val_y).item()
+                        
+                        # Save the best model based on validation loss
+                        if val_loss < best_val_loss:
+                            best_val_loss = val_loss
+                            torch.save(model.state_dict(), filepath)
+                            _log_print(f'Epoch {epoch+1}/{self.n_epochs} - loss: {train_loss:.4f} - val_loss: {val_loss:.4f}')
+                        
+                    if best_val_loss < float('inf'):  # restore best weights only if a checkpoint was written in this call
+                        model.load_state_dict(torch.load(filepath))
+
     def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
-        """Disaggregate power consumption for each appliance from aggregate mains data"""
-        
+        """Disaggregates a chunk of mains data."""
         if model is not None:
             self.models = model
-        
-        # Preprocess test data similar to training data
+
         if do_preprocessing:
-            print("Preprocessing test data...")
-            test_main_list = preprocess(
-                sequence_length=self.sequence_length,
-                mains_lst=test_main_list,
-                mains_mean=self.mains_mean,
-                mains_std=self.mains_std,
-                submeters_lst=None,
-                method="test",
-                appliance_params=self.appliance_params,
-                windowing=False
-            )
-        
+            test_main_list = self.call_preprocessing(
+                test_main_list, submeters_lst=None, method='test')
+
         test_predictions = []
-        
-        chunk_progress = tqdm(test_main_list, desc="Processing test chunks", unit="chunk")
-        
-        # Process each chunk of test data
-        for test_main in chunk_progress:
-            test_main = test_main.values
-            test_main = test_main.reshape((-1, self.sequence_length, 1))
-            test_main_tensor = torch.FloatTensor(test_main).to(self.device)
-            
+        for test_mains_df in test_main_list:
+            test_main_array = test_mains_df.values.reshape((-1, self.sequence_length, 1))
             disggregation_dict = {}
             
-            appliance_progress = tqdm(self.models.items(), desc="Disaggregating appliances", 
-                                    leave=False, unit="appliance")
-            
-            # Get predictions from each appliance model
-            for appliance, model in appliance_progress:
-                appliance_progress.set_postfix({"Current": appliance})
+            test_tensor = torch.tensor(test_main_array, dtype=torch.float32).to(self.device)
+            for appliance, model in self.models.items():
                 
                 model.eval()
-                
-                # Create DataLoader for batched inference
-                test_dataset = TensorDataset(test_main_tensor)
-                test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)
-                
-                predictions = []
-                
-                pred_progress = tqdm(test_loader, desc=f"Predicting {appliance}", 
-                                   leave=False, unit="batch")
-                
-                # Generate predictions
                 with torch.no_grad():
-                    for batch_x, in pred_progress:
-                        batch_pred = model(batch_x)
-                        predictions.append(batch_pred.cpu().numpy())
-                
-                prediction = np.concatenate(predictions, axis=0)
+                    # Process in batches to manage memory
+                    predictions = []
+                    for i in range(0, len(test_tensor), self.batch_size):
+                        batch = test_tensor[i:i + self.batch_size]
+                        batch_pred = model(batch).cpu().numpy()
+                        predictions.append(batch_pred)
+                    prediction = np.concatenate(predictions, axis=0)
                 
-                # Denormalize predictions back to original power scale
-                prediction = (self.appliance_params[appliance]['mean'] + 
-                            prediction * self.appliance_params[appliance]['std'])
+                # Denormalize the prediction
+                app_mean = self.appliance_params[appliance]['mean']
+                app_std = self.appliance_params[appliance]['std']
+                denormalized_prediction = app_mean + (prediction * app_std)
                 
-                # Ensure non-negative power values
-                valid_predictions = prediction.flatten()
-                valid_predictions = np.where(valid_predictions > 0, valid_predictions, 0)
-                df = pd.Series(valid_predictions)
+                # Set negative values to zero
+                denormalized_prediction[denormalized_prediction < 0] = 0
+                df = pd.Series(denormalized_prediction.flatten())
                 disggregation_dict[appliance] = df
-            
-            # Combine all appliance predictions for this chunk
+                
             results = pd.DataFrame(disggregation_dict, dtype='float32')
             test_predictions.append(results)
-        
         return test_predictions
-    
+
     def return_network(self):
-        """Factory method to create a new RNN model instance"""
+        """Returns a new, initialized RNNModel instance."""
         model = RNNModel(self.sequence_length).to(self.device)
         return model
-    
+
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """
+        Windows and normalizes mains readings and, for training, appliance readings.
+
+        Args:
+            mains_lst: iterable of DataFrames holding aggregate mains readings.
+            submeters_lst: (appliance_name, [DataFrame, ...]) pairs; only read
+                when method == 'train'.
+            method: 'train' returns (mains, appliances); any other value is
+                treated as 'test' and returns the processed mains only.
+
+        Raises:
+            ApplianceNotFoundError: if a training appliance has no entry in
+                self.appliance_params.
+        """
+        n = self.sequence_length
+        units_to_pad = n // 2
+        processed_mains_lst = []
+        for mains in mains_lst:
+            # Zero-pad both ends so that, for odd n, each sample gets a window centred on it.
+            new_mains = np.pad(mains.values.flatten(), (units_to_pad, units_to_pad),
+                               'constant', constant_values=(0, 0))
+            new_mains = np.array([new_mains[i:i + n] for i in range(len(new_mains) - n + 1)])
+            new_mains = (new_mains - self.mains_mean) / self.mains_std
+            processed_mains_lst.append(pd.DataFrame(new_mains))
+
+        # Test-time callers only need the windowed mains.
+        if method != 'train':
+            return processed_mains_lst
+
+        appliance_list = []
+        for app_name, app_df_lst in submeters_lst:
+            if app_name not in self.appliance_params:
+                raise ApplianceNotFoundError(f"Parameters for appliance '{app_name}' not found!")
+
+            app_mean = self.appliance_params[app_name]['mean']
+            app_std = self.appliance_params[app_name]['std']
+
+            # One normalized target value per row.
+            processed_app_dfs = [
+                pd.DataFrame((app_df.values.reshape((-1, 1)) - app_mean) / app_std)
+                for app_df in app_df_lst
+            ]
+            appliance_list.append((app_name, processed_app_dfs))
+        return processed_mains_lst, appliance_list
 
     def set_appliance_params(self, train_appliances):
-        """Compute normalization statistics (mean, std) for each appliance"""
-        print("Setting appliance parameters...")
-        
-        param_progress = tqdm(train_appliances, desc="Computing appliance stats", unit="appliance")
-        
-        for (app_name, df_list) in param_progress:
-            param_progress.set_postfix({"Current": app_name})
-            
-            # Concatenate all data for this appliance and compute statistics
-            l = np.array(pd.concat(df_list, axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
-            
-            # Prevent division by zero in normalization
+        """Computes and sets normalization parameters for each appliance."""
+        for (app_name, df_list) in train_appliances:
+            values = np.concatenate([df.values for df in df_list])
+            app_mean = np.mean(values)
+            app_std = np.std(values)
             if app_std < 1:
-                app_std = 100
-            self.appliance_params.update({app_name: {'mean': app_mean, 'std': app_std}})
-        
-        print(self.appliance_params)
\ No newline at end of file
+                app_std = 100  # Avoid division by zero for flat signals
+            self.appliance_params[app_name] = {'mean': app_mean, 'std': app_std}
+        _log_print("Appliance parameters set:", self.appliance_params)
\ No newline at end of file
diff --git a/nilmtk_contrib/torch/rnn_attention.py b/nilmtk_contrib/torch/rnn_attention.py
index 53d8b08..9de340d 100644
--- a/nilmtk_contrib/torch/rnn_attention.py
+++ b/nilmtk_contrib/torch/rnn_attention.py
@@ -1,32 +1,20 @@
 from __future__ import print_function, division
-from warnings import warn
 from nilmtk.disaggregate import Disaggregator
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-from torch.utils.data import Dataset, DataLoader, TensorDataset
-import os
-import pickle
+from torch.utils.data import DataLoader, TensorDataset
 import pandas as pd
 import numpy as np
 from collections import OrderedDict
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split
-from tqdm import tqdm
-import random
-import sys
-from nilmtk_contrib.torch.preprocessing import preprocess
-
-# Set random seeds for reproducibility across runs
-random.seed(10)
-np.random.seed(10)
-torch.manual_seed(10)
-if torch.cuda.is_available():
-    torch.cuda.manual_seed(10)
-    torch.cuda.manual_seed_all(10)
+from nilmtk_contrib.utils.validation import safe_train_test_split as train_test_split
 
 # Use GPU if available, otherwise fall back to CPU
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 class SequenceLengthError(Exception):
@@ -37,318 +25,256 @@ class ApplianceNotFoundError(Exception):
 
 class AttentionLayer(nn.Module):
     """
-    Attention mechanism to focus on relevant parts of the input sequence.
-    Inspired from: https://github.com/antoniosudoso/attention-nilm
+    An attention mechanism that computes a context-aware representation of the input sequence.
+    This implementation is designed to mirror the original TensorFlow version.
     """
     def __init__(self, units):
         super(AttentionLayer, self).__init__()
         self.units = units
-        # Linear layers for attention computation
-        self.W = nn.Linear(512, units)  # 512 = bidirectional LSTM output (256*2)
+        # Linear layers for computing attention scores
+        self.W = nn.Linear(512, units)  # Input is from a bidirectional LSTM (256*2)
         self.V = nn.Linear(units, 1)
         
-        # Initialize weights using He normal initialization
+        # Initialize weights with He normal to match TensorFlow's 'he_normal'
         nn.init.kaiming_normal_(self.W.weight, mode='fan_in', nonlinearity='relu')
         nn.init.kaiming_normal_(self.V.weight, mode='fan_in', nonlinearity='relu')
         nn.init.zeros_(self.W.bias)
         nn.init.zeros_(self.V.bias)
     
     def forward(self, encoder_output):
-        # encoder_output shape: (batch_size, sequence_length, hidden_size)
-        
-        # Compute attention scores
-        score = self.V(torch.tanh(self.W(encoder_output)))  # (batch_size, seq_len, 1)
-        
-        # Convert scores to probabilities
-        attention_weights = F.softmax(score, dim=1)  # (batch_size, seq_len, 1)
-        
-        # Compute weighted context vector
-        context_vector = attention_weights * encoder_output  # (batch_size, seq_len, hidden_size)
-        context_vector = torch.sum(context_vector, dim=1)  # (batch_size, hidden_size)
+        """
+        Args:
+            encoder_output: The output from the LSTM layer, shape (batch, seq_len, hidden_size).
+        Returns:
+            context_vector: The weighted sum of encoder outputs, shape (batch, hidden_size).
+        """
+        # Calculate alignment scores
+        score = self.V(torch.tanh(self.W(encoder_output)))  # (batch, seq_len, 1)
+        
+        # Convert scores to weights using softmax
+        attention_weights = F.softmax(score, dim=1)
+        
+        # Compute the context vector
+        context_vector = attention_weights * encoder_output
+        context_vector = torch.sum(context_vector, dim=1)
         
         return context_vector
 
 class RNNAttentionModel(nn.Module):
     """
-    Neural network combining CNN feature extraction, bidirectional LSTMs, 
-    and attention mechanism for NILM energy disaggregation.
+    An RNN-based model with an attention mechanism for NILM, designed to
+    mirror the original TensorFlow implementation.
     """
     def __init__(self, sequence_length):
         super(RNNAttentionModel, self).__init__()
         self.sequence_length = sequence_length
         
-        # 1D CNN for initial feature extraction from raw power sequence
-        self.conv1d = nn.Conv1d(
-            in_channels=1, 
-            out_channels=16, 
-            kernel_size=4, 
-            stride=1, 
-            padding=2  # Maintain sequence length
-        )
-        
-        # First bidirectional LSTM layer
-        self.lstm1 = nn.LSTM(
-            input_size=16,
-            hidden_size=128,
-            num_layers=1,
-            batch_first=True,
-            bidirectional=True
-        )
-        
-        # Second bidirectional LSTM layer for deeper feature learning
-        self.lstm2 = nn.LSTM(
-            input_size=256,  # 128 * 2 (bidirectional)
-            hidden_size=256,
-            num_layers=1,
-            batch_first=True,
-            bidirectional=True
-        )
-        
-        # Attention mechanism to focus on important time steps
+        # Layers are defined to match the TensorFlow architecture
+        self.conv1d = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=4, 
+                                stride=1, padding=2)  # NOTE: even kernel (4) with padding=2 yields seq_len + 1 steps, not exact 'same' padding
+        self.lstm1 = nn.LSTM(input_size=16, hidden_size=128, batch_first=True, bidirectional=True)
+        self.lstm2 = nn.LSTM(input_size=256, hidden_size=256, batch_first=True, bidirectional=True)
         self.attention = AttentionLayer(units=128)
+        self.fc1 = nn.Linear(512, 128)
+        self.fc2 = nn.Linear(128, 1)
         
-        # Final fully connected layers for prediction
-        self.fc1 = nn.Linear(512, 128)  # 256 * 2 (bidirectional)
-        self.fc2 = nn.Linear(128, 1)   # Output single power value
-        
-        # Dropout for regularization
-        self.dropout = nn.Dropout(0.1)
+        self._initialize_weights()
+    
+    def _initialize_weights(self):
+        """Applies Xavier-uniform weights and zero biases to the Conv1d, LSTM and Linear layers owned directly by this model."""
+        # children() rather than modules(): do not recurse into AttentionLayer, which sets its own He-normal init for W and V.
+        for m in self.children():
+            if isinstance(m, (nn.Conv1d, nn.Linear)):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.LSTM):
+                for name, param in m.named_parameters():
+                    if 'weight' in name:
+                        nn.init.xavier_uniform_(param)
+                    elif 'bias' in name:
+                        nn.init.zeros_(param)
     
     def forward(self, x):
-        # Input shape: (batch_size, sequence_length, 1)
-        # Rearrange for Conv1D: (batch_size, channels, sequence_length)
+        # Input shape: (batch, seq_len, 1) -> permute for Conv1D
         x = x.permute(0, 2, 1)
         
-        # Extract features using 1D convolution
-        x = self.conv1d(x)  # (batch_size, 16, sequence_length)
+        # Feature extraction
+        x = self.conv1d(x)
         
-        # Rearrange back for LSTM: (batch_size, sequence_length, features)
+        # Permute for LSTM layers
         x = x.permute(0, 2, 1)
         
-        # Process through bidirectional LSTM layers
-        x, _ = self.lstm1(x)  # (batch_size, sequence_length, 256)
-        x = self.dropout(x)
-        
-        x, _ = self.lstm2(x)  # (batch_size, sequence_length, 512)
-        
-        # Apply attention to get context-aware representation
-        x = self.attention(x)  # (batch_size, 512)
+        # Sequence processing
+        x, _ = self.lstm1(x)
+        x, _ = self.lstm2(x)
         
-        # Final prediction layers
-        x = torch.tanh(self.fc1(x))  # (batch_size, 128)
-        x = self.dropout(x)
-        x = self.fc2(x)  # (batch_size, 1)
+        # Attention and final prediction
+        x = self.attention(x)
+        x = torch.tanh(self.fc1(x))
+        x = self.fc2(x)
         
         return x
 
 class RNN_attention(Disaggregator):
     """
-    NILM disaggregator using RNN with attention mechanism.
-    Inherits from NILMTK's Disaggregator base class.
-    """
+    RNN with attention mechanism for non-intrusive load monitoring.
+    
+    This implementation is based on the paper:
+    "ResNet-based Multi-output Regression for NILM: Towards Enhanced Appliance State Detection"
+    https://arxiv.org/abs/2411.15805v1
+    
+    The model uses bidirectional LSTM layers with attention mechanism for learning 
+    temporal dependencies and focusing on relevant time steps in energy 
+    disaggregation tasks.
     
+    Architecture Overview:
+    - Bidirectional LSTM layers for sequence modeling
+    - Attention mechanism for learning relevant temporal features
+    - Dense layers for final power consumption prediction
+    - Sequence-to-point prediction for energy disaggregation
+    
+    Parameters:
+        params (dict): Configuration parameters including:
+            - sequence_length (int): Length of input sequences (default: 19)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+            - chunk_wise_training (bool): Enable chunk-wise training (default: False)
+            - appliance_params (dict): Appliance-specific normalization parameters
+    """
     def __init__(self, params):
-        """Initialize the disaggregator with hyperparameters"""
+        """Initializes the disaggregator and its hyperparameters."""
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
         self.MODEL_NAME = "RNN_attention"
-        self.models = OrderedDict()  # Store separate models for each appliance
+        self.models = OrderedDict()
         
-        # Extract hyperparameters from params dict
         self.chunk_wise_training = params.get('chunk_wise_training', False)
         self.sequence_length = params.get('sequence_length', 19)
         self.n_epochs = params.get('n_epochs', 10)
         self.batch_size = params.get('batch_size', 512)
         self.load_model_path = params.get('load_model_path', None)
-        self.appliance_params = params.get('appliance_params', {})  # Normalization stats
+        self.appliance_params = params.get('appliance_params', {})
         self.mains_mean = params.get('mains_mean', 1800)
         self.mains_std = params.get('mains_std', 600)
         self.device = device
         
-        # Sequence length must be odd for proper windowing
         if self.sequence_length % 2 == 0:
-            print("Sequence length should be odd!")
-            raise SequenceLengthError
+            raise SequenceLengthError("Sequence length must be odd for proper windowing.")
     
     def partial_fit(self, train_main, train_appliances, do_preprocessing=True, **load_kwargs):
-        """Train models on a chunk of data (supports incremental learning)"""
-        
-        # Compute appliance-specific normalization parameters if not provided
-        if len(self.appliance_params) == 0:
+        """Trains the model on a chunk of data."""
+        if not self.appliance_params:
             self.set_appliance_params(train_appliances)
         
-        print("...............RNN_attention partial_fit running...............")
+        _log_print("...............RNN_attention partial_fit running...............")
         
-        # Preprocess data: windowing, normalization, etc.
         if do_preprocessing:
-            print("Preprocessing data...")
-            train_main, train_appliances = preprocess(
-                sequence_length=self.sequence_length,
-                mains_mean = self.mains_mean,
-                mains_std=self.mains_std,
-                mains_lst=train_main,
-                submeters_lst=train_appliances,
-                method="train",
-                appliance_params=self.appliance_params,
-                windowing=False
-            )
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
         
-        # Prepare main power data for training
-        train_main = pd.concat(train_main, axis=0)
-        train_main = train_main.values.reshape((-1, self.sequence_length, 1))
+        # Prepare data for training
+        train_main = pd.concat(train_main, axis=0).values.reshape((-1, self.sequence_length, 1))
         
-        # Prepare appliance power data
         new_train_appliances = []
-        for app_name, app_df in train_appliances:
-            app_df = pd.concat(app_df, axis=0)
-            app_df_values = app_df.values.reshape((-1, 1))
+        for app_name, app_dfs in train_appliances:
+            app_df_values = pd.concat(app_dfs, axis=0).values.reshape((-1, 1))
             new_train_appliances.append((app_name, app_df_values))
         train_appliances = new_train_appliances
         
-        print(f"Training data shape: {train_main.shape}")
-        
-        # Train a separate model for each appliance
-        appliance_progress = tqdm(train_appliances, desc="Training appliances", unit="appliance")
-        
-        for appliance_name, power in appliance_progress:
-            appliance_progress.set_postfix({"Current": appliance_name})
-            
-            # Create new model if this appliance hasn't been seen before
+        # Train a model for each appliance
+        for appliance_name, power in train_appliances:
             if appliance_name not in self.models:
-                print(f"\nFirst model training for {appliance_name}")
+                _log_print(f"First time training for {appliance_name}")
                 self.models[appliance_name] = self.return_network()
             else:
-                print(f"\nStarted Retraining model for {appliance_name}")
+                _log_print(f"Retraining model for {appliance_name}")
             
             model = self.models[appliance_name]
             
-            # Train only if we have sufficient data
-            if train_main.size > 0 and len(train_main) > 10:
-                # Split data into training and validation sets
+            if len(train_main) > 10:
+                # Split only when more than 10 windows (rows) exist; .size would count elements, not samples
                 train_x, v_x, train_y, v_y = train_test_split(
-                    train_main, power, test_size=.15, random_state=10)
+                    train_main, power, test_size=0.15, random_state=10)
                 
-                # Convert to PyTorch tensors and move to device
+                # Convert to PyTorch Tensors
                 train_x = torch.FloatTensor(train_x).to(self.device)
                 v_x = torch.FloatTensor(v_x).to(self.device)
                 train_y = torch.FloatTensor(train_y).to(self.device)
                 v_y = torch.FloatTensor(v_y).to(self.device)
                 
-                # Create PyTorch DataLoaders for batch processing
+                # Create DataLoaders
                 train_dataset = TensorDataset(train_x, train_y)
                 val_dataset = TensorDataset(v_x, v_y)
                 train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
                 val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False)
                 
-                # Train the model
                 self.train_model(model, train_loader, val_loader, appliance_name)
     
     def train_model(self, model, train_loader, val_loader, appliance_name):
-        """Train a single appliance model with early stopping based on validation loss"""
-        optimizer = optim.Adam(model.parameters(), lr=0.001)
+        """Handles the training and validation loop for a single appliance model."""
+        optimizer = optim.Adam(model.parameters())
         criterion = nn.MSELoss()
         
         best_val_loss = float('inf')
         best_model_state = None
         
-        epoch_progress = tqdm(range(self.n_epochs), desc=f"Training {appliance_name}", unit="epoch")
-        
-        for epoch in epoch_progress:
-            # Training phase
+        for epoch in range(self.n_epochs):
+            # --- Training Phase ---
             model.train()
             train_loss = 0.0
             
-            train_batch_progress = tqdm(train_loader, desc=f"Epoch {epoch+1} Training", 
-                                      leave=False, unit="batch")
-            
-            for batch_x, batch_y in train_batch_progress:
+            for batch_x, batch_y in train_loader:
                 optimizer.zero_grad()
-                
                 outputs = model(batch_x)
                 loss = criterion(outputs.squeeze(), batch_y.squeeze())
-                
                 loss.backward()
                 optimizer.step()
-                
                 train_loss += loss.item()
-                train_batch_progress.set_postfix({"Loss": f"{loss.item():.4f}"})
             
-            # Validation phase
+            # --- Validation Phase ---
             model.eval()
             val_loss = 0.0
             
-            val_batch_progress = tqdm(val_loader, desc=f"Epoch {epoch+1} Validation", 
-                                    leave=False, unit="batch")
-            
             with torch.no_grad():
-                for batch_x, batch_y in val_batch_progress:
+                for batch_x, batch_y in val_loader:
                     outputs = model(batch_x)
                     loss = criterion(outputs.squeeze(), batch_y.squeeze())
                     val_loss += loss.item()
-                    val_batch_progress.set_postfix({"Loss": f"{loss.item():.4f}"})
             
-            # Calculate average losses
             train_loss /= len(train_loader)
             val_loss /= len(val_loader)
             
-            epoch_progress.set_postfix({
-                "Train Loss": f"{train_loss:.4f}",
-                "Val Loss": f"{val_loss:.4f}",
-                "Best": f"{best_val_loss:.4f}"
-            })
-            
-            # Save best model based on validation loss
+            # Save the best model based on validation loss
             if val_loss < best_val_loss:
                 best_val_loss = val_loss
                 best_model_state = model.state_dict().copy()
-                epoch_progress.write(f'New best model saved with val_loss: {val_loss:.4f}')
                 
-                # Save model checkpoint
-                filepath = f'RNN_attention-temp-weights-{appliance_name.replace(" ", "_")}-{random.randint(0,100000)}.pth'
+                filepath = checkpoint_path(".pth")
                 torch.save(best_model_state, filepath)
+                _log_print(f'Epoch {epoch+1}: val_loss improved to {val_loss:.6f}, saving model to {filepath}')
         
-        # Load the best model weights
+        # Load the best performing model
         if best_model_state is not None:
             model.load_state_dict(best_model_state)
-            print(f"\nLoaded best model for {appliance_name} with validation loss: {best_val_loss:.4f}")
     
     def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
-        """Disaggregate power consumption for each appliance from aggregate mains data"""
-        
+        """Disaggregates a chunk of mains data."""
         if model is not None:
             self.models = model
         
-        # Preprocess test data similar to training data
         if do_preprocessing:
-            print("Preprocessing test data...")
-            test_main_list = preprocess(
-                sequence_length=self.sequence_length,
-                mains_mean=self.mains_mean,
-                mains_std=self.mains_std,
-                mains_lst=test_main_list,
-                submeters_lst=None,
-                method="test",
-                appliance_params=self.appliance_params,
-                windowing=False
-            )
+            test_main_list = self.call_preprocessing(
+                test_main_list, submeters_lst=None, method='test')
         
         test_predictions = []
         
-        chunk_progress = tqdm(test_main_list, desc="Processing test chunks", unit="chunk")
-        
-        # Process each chunk of test data
-        for test_main in chunk_progress:
-            test_main = test_main.values
-            test_main = test_main.reshape((-1, self.sequence_length, 1))
-            test_main_tensor = torch.FloatTensor(test_main).to(self.device)
+        for test_mains_df in test_main_list:
+            test_main_array = test_mains_df.values.reshape((-1, self.sequence_length, 1))
+            test_main_tensor = torch.FloatTensor(test_main_array).to(self.device)
             
             disggregation_dict = {}
             
-            appliance_progress = tqdm(self.models.items(), desc="Disaggregating appliances", 
-                                    leave=False, unit="appliance")
-            
-            # Get predictions from each appliance model
-            for appliance, model in appliance_progress:
-                appliance_progress.set_postfix({"Current": appliance})
-                
+            for appliance, model in self.models.items():
                 model.eval()
                 
                 # Create DataLoader for batched inference
@@ -356,57 +282,86 @@ def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
                 test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)
                 
                 predictions = []
-                
-                pred_progress = tqdm(test_loader, desc=f"Predicting {appliance}", 
-                                   leave=False, unit="batch")
-                
-                # Generate predictions
                 with torch.no_grad():
-                    for batch_x, in pred_progress:
+                    for batch_x, in test_loader:
                         batch_pred = model(batch_x)
                         predictions.append(batch_pred.cpu().numpy())
                 
                 prediction = np.concatenate(predictions, axis=0)
                 
-                # Denormalize predictions back to original power scale
-                prediction = (self.appliance_params[appliance]['mean'] + 
-                            prediction * self.appliance_params[appliance]['std'])
+                # Denormalize predictions
+                app_mean = self.appliance_params[appliance]['mean']
+                app_std = self.appliance_params[appliance]['std']
+                denormalized_prediction = app_mean + (prediction * app_std)
                 
-                # Ensure non-negative power values
-                valid_predictions = prediction.flatten()
-                valid_predictions = np.where(valid_predictions > 0, valid_predictions, 0)
-                df = pd.Series(valid_predictions)
+                # Set negative values to zero
+                denormalized_prediction[denormalized_prediction < 0] = 0
+                df = pd.Series(denormalized_prediction.flatten())
                 disggregation_dict[appliance] = df
             
-            # Combine all appliance predictions for this chunk
             results = pd.DataFrame(disggregation_dict, dtype='float32')
             test_predictions.append(results)
         
         return test_predictions
     
     def return_network(self):
-        """Factory method to create a new RNN_Attention model instance"""
+        """Returns a new, initialized RNNAttentionModel instance."""
         model = RNNAttentionModel(self.sequence_length).to(self.device)
         return model
+    
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """
+        Windows and z-scores each mains chunk; when method == 'train', also z-scores each appliance chunk (no windowing).
+        Returns (mains_dfs, [(app_name, dfs), ...]) for 'train', else mains_dfs only. Raises ApplianceNotFoundError if stats are missing.
+        """
+        if method == 'train':
+            # Mains: zero-pad n // 2 per side, take every stride-1 window of length n, then normalize with mains_mean/mains_std
+            processed_mains_lst = []
+            for mains in mains_lst:
+                new_mains = mains.values.flatten()
+                n = self.sequence_length
+                units_to_pad = n // 2
+                new_mains = np.pad(new_mains, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+                new_mains = np.array([new_mains[i:i + n] for i in range(len(new_mains) - n + 1)])
+                new_mains = (new_mains - self.mains_mean) / self.mains_std
+                processed_mains_lst.append(pd.DataFrame(new_mains))
+
+            # Appliances: one normalized target value per sample (column vector), using the stored per-appliance mean/std
+            appliance_list = []
+            for app_index, (app_name, app_df_lst) in enumerate(submeters_lst):
+                if app_name not in self.appliance_params:
+                    raise ApplianceNotFoundError(f"Parameters for appliance '{app_name}' not found!")
+                
+                app_mean = self.appliance_params[app_name]['mean']
+                app_std = self.appliance_params[app_name]['std']
+
+                processed_app_dfs = []
+                for app_df in app_df_lst:
+                    new_app_readings = app_df.values.reshape((-1, 1))
+                    new_app_readings = (new_app_readings - app_mean) / app_std
+                    processed_app_dfs.append(pd.DataFrame(new_app_readings))
+                appliance_list.append((app_name, processed_app_dfs))
+            return processed_mains_lst, appliance_list
+
+        else: # any method other than 'train' (e.g. 'test'): mains only, submeters_lst is ignored
+            processed_mains_lst = []
+            for mains in mains_lst:
+                new_mains = mains.values.flatten()
+                n = self.sequence_length
+                units_to_pad = n // 2
+                new_mains = np.pad(new_mains, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+                new_mains = np.array([new_mains[i:i + n] for i in range(len(new_mains) - n + 1)])
+                new_mains = (new_mains - self.mains_mean) / self.mains_std
+                processed_mains_lst.append(pd.DataFrame(new_mains))
+            return processed_mains_lst
         
     def set_appliance_params(self, train_appliances):
-        """Compute normalization statistics (mean, std) for each appliance"""
-        print("Setting appliance parameters...")
-        
-        param_progress = tqdm(train_appliances, desc="Computing appliance stats", unit="appliance")
-        
-        for (app_name, df_list) in param_progress:
-            param_progress.set_postfix({"Current": app_name})
-            
-            # Concatenate all data for this appliance and compute statistics
-            l = np.array(pd.concat(df_list, axis=0))
-            app_mean = np.mean(l)
-            app_std = np.std(l)
-            
-            # Prevent division by zero in normalization
+        """Computes and sets normalization parameters for each appliance."""
+        for (app_name, df_list) in train_appliances:
+            values = np.concatenate([df.values for df in df_list])
+            app_mean = np.mean(values)
+            app_std = np.std(values)
             if app_std < 1:
-                app_std = 100
-                
-            self.appliance_params.update({app_name: {'mean': app_mean, 'std': app_std}})
-        
-        print(self.appliance_params)
\ No newline at end of file
+                app_std = 100  # Avoid division by zero for flat signals
+            self.appliance_params[app_name] = {'mean': app_mean, 'std': app_std}
+        _log_print("Appliance parameters set:", self.appliance_params)
diff --git a/nilmtk_contrib/torch/rnn_attention_classification.py b/nilmtk_contrib/torch/rnn_attention_classification.py
index 6b70791..6ca0f78 100644
--- a/nilmtk_contrib/torch/rnn_attention_classification.py
+++ b/nilmtk_contrib/torch/rnn_attention_classification.py
@@ -1,310 +1,510 @@
-from __future__ import annotations
-import copy, numpy as np, pandas as pd
-from collections import OrderedDict
-from typing import Dict, Any, List, Tuple
-
+from __future__ import print_function, division
+from nilmtk.disaggregate import Disaggregator
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.utils.data import TensorDataset, DataLoader
-from tqdm import tqdm                                 
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+import pandas as pd
+import numpy as np
+from collections import OrderedDict
+from nilmtk_contrib.utils.validation import safe_train_test_split as train_test_split
+import copy
 
-from nilmtk.disaggregate import Disaggregator
-from nilmtk_contrib.torch.preprocessing import preprocess
+# Set device
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+from nilmtk_contrib.preprocessing.classification import (
+    appliance_threshold,
+    classification_metadata,
+    loss_weight_metadata,
+)
 
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 class SequenceLengthError(Exception):
     pass
 
-
 class ApplianceNotFoundError(Exception):
     pass
 
-
-class IdentityBlock(nn.Module):
-    def __init__(self, ch: int, k: int):
-        super().__init__()
-        self.c1 = nn.Conv1d(ch, ch, k, padding="same")
-        self.c2 = nn.Conv1d(ch, ch, k, padding="same")
-        self.c3 = nn.Conv1d(ch, ch, k, padding="same")
-        self.act = nn.ReLU()
-
-    def forward(self, x):
-        s = x
-        x = self.act(self.c1(x))
-        x = self.act(self.c2(x))
-        x = self.c3(x)
-        return self.act(x + s)
-
-
-class ConvBlock(nn.Module):
-    def __init__(self, ch_in: int, ch_mid: int, ch_out: int, k: int):
-        super().__init__()
-        self.c1 = nn.Conv1d(ch_in,  ch_mid, k, padding="same")
-        self.c2 = nn.Conv1d(ch_mid, ch_mid, k, padding="same")
-        self.c3 = nn.Conv1d(ch_mid, ch_out, k, padding="same")
-        self.proj = nn.Conv1d(ch_in, ch_out, 1)
-        self.act = nn.ReLU()
-
-    def forward(self, x):
-        s = self.proj(x)
-        x = self.act(self.c1(x))
-        x = self.act(self.c2(x))
-        x = self.c3(x)
-        return self.act(x + s)
-
-
 class AttentionLayer(nn.Module):
-    """Additive (Bahdanau) attention over the Bi-LSTM outputs."""
-    def __init__(self, units: int):
-        super().__init__()
-        self.W = nn.Linear(units * 2, units)   # *2 : bidirectional
+    """
+    An attention layer that computes a context vector from encoder outputs.
+    This implementation is designed to mirror the original TensorFlow version.
+    """
+    def __init__(self, units):
+        super(AttentionLayer, self).__init__()
+        # Layers to compute attention scores
+        self.W = nn.Linear(units * 2, units)  # Input is bidirectional, hence *2
         self.V = nn.Linear(units, 1)
-
-    def forward(self, enc_out):               # (B, T, 2H)
-        score = self.V(torch.tanh(self.W(enc_out)))   # (B,T,1)
-        weights = torch.softmax(score, dim=1)         # (B,T,1)
-        ctx = torch.sum(weights * enc_out, dim=1)     # (B,2H)
-        return ctx, weights.squeeze(-1)               # (B,2H), (B,T)
-
-
-class _RNNAttNet(nn.Module):
-    def __init__(self, seq_len: int):
-        super().__init__()
-        self.seq_len = seq_len
-
-        self.cls_feat = nn.Sequential(
-            nn.Conv1d(1, 30, 10), nn.ReLU(),
-            nn.Conv1d(30, 30, 8), nn.ReLU(),
-            nn.Conv1d(30, 40, 6), nn.ReLU(),
-            nn.Conv1d(40, 50, 5), nn.ReLU(),
-            nn.Conv1d(50, 50, 5), nn.ReLU(),
-            nn.Conv1d(50, 50, 5), nn.ReLU(),
-            nn.Flatten(),
-            nn.LazyLinear(1024), nn.ReLU()
-        )
-        self.cls_head = nn.Sequential(
-            nn.Linear(1024, seq_len),
-            nn.Sigmoid()
-        )
-
-        self.conv_reg = nn.Conv1d(1, 16, 4, padding="same")
-        self.bi1 = nn.LSTM(16, 128, batch_first=True, bidirectional=True)
-        self.bi2 = nn.LSTM(256, 256, batch_first=True, bidirectional=True)
-        self.att = AttentionLayer(256)
-        self.reg_dense = nn.Sequential(
-            nn.Linear(512, 128), nn.Tanh(),
-            nn.Linear(128, seq_len)
-        )
-
-    def forward(self, x):                     # x (B,1,L)
-        cls = self.cls_head(self.cls_feat(x))     # (B,L)
-
-        y = self.conv_reg(x).permute(0, 2, 1)     # (B,L,16)
-        y,_ = self.bi1(y)
-        y,_ = self.bi2(y)
-        ctx, att = self.att(y)                    # (B,512)
-        reg = self.reg_dense(ctx)                 # (B,L)
-
-        return reg * cls, cls, att                # masked power, on/off, att
-
+        
+        # He-normal init for W/V. NOTE(review): Keras Dense defaults to Glorot uniform, and RNNAttentionClassificationNet._initialize_weights re-initializes these layers with Xavier uniform - confirm intent
+        nn.init.kaiming_normal_(self.W.weight, nonlinearity='relu')
+        nn.init.kaiming_normal_(self.V.weight, nonlinearity='relu')
+        nn.init.zeros_(self.W.bias)
+        nn.init.zeros_(self.V.bias)
+    
+    def forward(self, encoder_output):
+        """
+        Args:
+            encoder_output: The output from the LSTM layer, shape (batch, seq_len, hidden_size*2).
+        Returns:
+            context_vector: The weighted sum of encoder outputs, shape (batch, hidden_size*2).
+            attention_weights: The computed attention weights, shape (batch, seq_len).
+        """
+        # Calculate alignment scores
+        score = self.V(torch.tanh(self.W(encoder_output)))  # (batch, seq_len, 1)
+        
+        # Convert scores to weights using softmax
+        attention_weights = F.softmax(score, dim=1)  # (batch, seq_len, 1)
+        
+        # Compute the context vector
+        context_vector = attention_weights * encoder_output
+        context_vector = torch.sum(context_vector, dim=1)
+        
+        return context_vector, attention_weights.squeeze(-1)
+
+class RNNAttentionClassificationNet(nn.Module):
+    """
+    A dual-subnetwork model for NILM, combining a CNN-based classification
+    network and an RNN-with-attention regression network. The architecture
+    is designed to mirror the original TensorFlow implementation.
+    """
+    def __init__(self, sequence_length):
+        super(RNNAttentionClassificationNet, self).__init__()
+        self.sequence_length = sequence_length
+        
+        # --- CLASSIFICATION SUBNETWORK (CNN) ---
+        self.cls_conv1 = nn.Conv1d(1, 30, kernel_size=10, padding='valid')
+        self.cls_conv2 = nn.Conv1d(30, 30, kernel_size=8, padding='valid')
+        self.cls_conv3 = nn.Conv1d(30, 40, kernel_size=6, padding='valid')
+        self.cls_conv4 = nn.Conv1d(40, 50, kernel_size=5, padding='valid')
+        self.cls_conv5 = nn.Conv1d(50, 50, kernel_size=5, padding='valid')
+        self.cls_conv6 = nn.Conv1d(50, 50, kernel_size=5, padding='valid')
+        
+        # Calculate the flattened size dynamically after convolutions
+        self._calculate_cls_flatten_size(sequence_length)
+        
+        self.cls_dense1 = nn.Linear(self.cls_flatten_size, 1024)
+        self.cls_dense2 = nn.Linear(1024, sequence_length)
+        
+        # --- REGRESSION SUBNETWORK (RNN with Attention) ---
+        self.reg_conv = nn.Conv1d(1, 16, kernel_size=4, stride=1, padding='same')
+        self.bi_lstm1 = nn.LSTM(16, 128, batch_first=True, bidirectional=True)
+        self.bi_lstm2 = nn.LSTM(256, 256, batch_first=True, bidirectional=True)
+        self.attention = AttentionLayer(256)
+        self.reg_dense1 = nn.Linear(512, 128)  # 512 = 256 * 2 (bidirectional)
+        self.reg_dense2 = nn.Linear(128, sequence_length)
+        
+        self._initialize_weights()
+
+    def _calculate_cls_flatten_size(self, seq_len):
+        """Calculates the input size for the classification FC layer."""
+        # Each conv layer reduces length by (kernel_size - 1)
+        conv_output_length = seq_len - (10-1) - (8-1) - (6-1) - (5-1) - (5-1) - (5-1)
+        self.cls_flatten_size = 50 * conv_output_length
+    
+    def _initialize_weights(self):
+        """Applies Xavier-uniform weights and zero biases to every Conv1d, Linear and LSTM submodule."""
+        for m in self.modules():
+            if isinstance(m, (nn.Conv1d, nn.Linear)):
+                # self.modules() recurses, so this also replaces the He-normal init AttentionLayer applied to its W and V
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.LSTM):
+                # Every parameter whose name contains 'weight' gets Xavier uniform; every 'bias' parameter is zeroed
+                for name, param in m.named_parameters():
+                    if 'weight' in name:
+                        nn.init.xavier_uniform_(param)
+                    elif 'bias' in name:
+                        nn.init.zeros_(param)
+    
+    def forward(self, x):
+        """
+        Performs the forward pass, combining classification and regression outputs.
+        
+        Args:
+            x: Input tensor of shape (batch_size, 1, sequence_length).
+        Returns:
+            output: The final disaggregated power, shape (batch, seq_len).
+            classification_output: The appliance status prediction, shape (batch, seq_len).
+            attention_weights: The attention weights from the regression subnetwork, shape (batch, seq_len).
+        """
+        # --- CLASSIFICATION SUBNETWORK ---
+        cls_x = F.relu(self.cls_conv1(x))
+        cls_x = F.relu(self.cls_conv2(cls_x))
+        cls_x = F.relu(self.cls_conv3(cls_x))
+        cls_x = F.relu(self.cls_conv4(cls_x))
+        cls_x = F.relu(self.cls_conv5(cls_x))
+        cls_x = F.relu(self.cls_conv6(cls_x))
+        cls_x = cls_x.flatten(1)
+        cls_x = F.relu(self.cls_dense1(cls_x))
+        classification_output = torch.sigmoid(self.cls_dense2(cls_x))
+        
+        # --- REGRESSION SUBNETWORK ---
+        reg_x = self.reg_conv(x).permute(0, 2, 1)  # (batch, seq_len, 16)
+        reg_x, _ = self.bi_lstm1(reg_x)
+        reg_x, _ = self.bi_lstm2(reg_x)
+        context_vector, attention_weights = self.attention(reg_x)
+        reg_x = torch.tanh(self.reg_dense1(context_vector))
+        regression_output = self.reg_dense2(reg_x)
+        
+        # Final output is the element-wise product of the two subnetworks
+        output = regression_output * classification_output
+        
+        return output, classification_output, attention_weights
 
 class RNN_attention_classification(Disaggregator):
     """
-    RNN-based disaggregator with attention mechanism for classification.
-    This model uses a combination of convolutional layers, LSTM layers,
-    and attention mechanisms to disaggregate mains electricity data into
-    appliance-level data.
+    RNN with attention and classification for non-intrusive load monitoring.
+    
+    This implementation is based on the paper:
+    "ResNet-based Multi-output Regression for NILM: Towards Enhanced Appliance State Detection"
+    https://arxiv.org/abs/2411.15805v1
+    
+    The model combines RNN with attention mechanism and CNN-based classification for 
+    enhanced appliance state detection and power consumption prediction in energy 
+    disaggregation tasks.
+    
+    Architecture Overview:
+    - Classification subnetwork with 1D convolutions for appliance state detection
+    - Regression subnetwork with bidirectional LSTM and attention mechanism
+    - Attention layer for learning relevant temporal features
+    - Element-wise multiplication of classification and regression outputs
+    - Multi-output learning for enhanced appliance state detection
+    
+    Parameters:
+        params (dict): Configuration parameters including:
+            - sequence_length (int): Length of input sequences (default: 99)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+            - chunk_wise_training (bool): Enable chunk-wise training (default: False)
+            - appliance_params (dict): Appliance-specific normalization parameters
+            - mains_params (dict): Mains-specific normalization parameters
     """
-    def __init__(self, params: Dict[str, Any]):
-        super().__init__()
+    def __init__(self, params):
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
         self.MODEL_NAME = "RNN_attention_classification"
-        self.chunk_wise_training = params.get("chunk_wise_training", True)
-        self.sequence_length = params.get("sequence_length", 99)
+        self.chunk_wise_training = params.get('chunk_wise_training', False)
+        self.sequence_length = params.get('sequence_length', 99)
+        self.n_epochs = params.get('n_epochs', 10)
+        self.models = OrderedDict()
+        self.att_models = OrderedDict()  # Store attention models separately like TensorFlow
+        self.mains_mean = 1800
+        self.mains_std = 600
+        self.batch_size = params.get('batch_size', 512)
+        self.appliance_params = params.get('appliance_params', {})
+        self.mains_params = params.get('mains_params', {})
+        self.device = device
+        self.classification_threshold = params.get('classification_threshold', params.get('on_power_threshold', 15))
+        self.regression_loss_weight = params.get('regression_loss_weight', 1.0)
+        self.classification_loss_weight = params.get('classification_loss_weight', 1.0)
+        self.classification_metadata = classification_metadata(
+            self.appliance_params,
+            self.classification_threshold,
+        )
+        self.loss_weight_metadata = loss_weight_metadata(
+            self.regression_loss_weight,
+            self.classification_loss_weight,
+        )
+        
         if self.sequence_length % 2 == 0:
-            raise SequenceLengthError("Sequence length must be odd")
-
-        self.n_epochs   = params.get("n_epochs", 10)
-        self.batch_size = params.get("batch_size", 512)
-
-        self.appliance_params: Dict[str, Dict[str, float]] = {}
-        self.mains_mean, self.mains_std = 1800, 600
-
-        self.models: "OrderedDict[str,_RNNAttNet]" = OrderedDict()
-        self.best: Dict[str, float] = {}
-
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    def _fresh_network(self):
-        return _RNNAttNet(self.sequence_length).to(self.device)
-    
-    def set_mains_params(self, mains_list):
-        data = np.concatenate([m.values.flatten() for m in mains_list])
-        self.mains_mean = data.mean()
-        self.mains_std  = max(data.std(), 1.0)
-
-    def set_appliance_params(self, train_apps):
-        for app, dfs in train_apps:
-            data = np.concatenate([d.values.flatten() for d in dfs])
-            self.appliance_params[app] = {
-                "mean": data.mean(),
-                "std" : max(data.std(), 1.0),
-                "min" : data.min(),
-                "max" : data.max()
+            raise SequenceLengthError("Sequence length must be odd!")
+
+    def return_network(self):
+        """Returns a new model and a corresponding attention model wrapper."""
+        model = RNNAttentionClassificationNet(self.sequence_length).to(self.device)
+        
+        # Wrapper to extract attention weights, for compatibility with TF version
+        class AttentionWrapper(nn.Module):
+            def __init__(self, full_model):
+                super().__init__()
+                self.full_model = full_model
+            
+            def forward(self, x):
+                _, _, attention_weights = self.full_model(x)
+                return attention_weights
+        
+        attention_model = AttentionWrapper(model).to(self.device)
+        return model, attention_model
+
+    def classify(self, classify_appliance):
+        """
+        Builds windowed binary on/off targets: readings above the per-appliance threshold become 1, all others 0.
+        Returns a list of (appliance_name, [DataFrame, ...]); each DataFrame holds stride-1 windows of width sequence_length.
+        """
+        appliance_on_off = []
+
+        for app_index, (appliance_name, on_off_list) in enumerate(classify_appliance):
+            threshold = appliance_threshold(
+                self.appliance_params,
+                appliance_name,
+                self.classification_threshold,
+            )
+            classification_appliance_dfs = []
+            for appliance in on_off_list:
+                n = self.sequence_length
+                units_to_pad = n // 2
+                
+                # Binarize a copy; order-dependent: assumes threshold >= 0 (a negative one would re-mark the zeros just written as 1)
+                appliance_copy = appliance.copy()
+                appliance_copy[appliance_copy <= threshold] = 0
+                appliance_copy[appliance_copy > threshold] = 1
+                
+                # Zero-pad n // 2 on both sides (padding counts as "off"), then take every stride-1 window of length n
+                new_app_readings = appliance_copy.values.flatten()
+                new_app_readings = np.pad(new_app_readings, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+                new_app_readings = np.array([new_app_readings[i:i + n] for i in range(len(new_app_readings) - n + 1)])
+                classification_appliance_dfs.append(pd.DataFrame(new_app_readings))
+                
+            appliance_on_off.append((appliance_name, classification_appliance_dfs))
+        return appliance_on_off
+
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """
+        Window and normalize mains (and, for 'train', appliance) readings.
+
+        For method == 'train', returns (mains_windows, appliance_windows) where
+        both are built from zero-padded signals. For any other method, returns
+        only the mains windows, built without padding.
+
+        Raises ApplianceNotFoundError if an appliance has no stored parameters.
+        """
+        n = self.sequence_length
+        units_to_pad = n // 2
+
+        def make_windows(values, pad):
+            # Stride-1 sliding windows of length n, optionally zero-padded.
+            if pad:
+                values = np.pad(values, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+            return np.array([values[i:i + n] for i in range(len(values) - n + 1)])
+
+        if method == 'train':
+            processed_mains_lst = []
+            for mains in mains_lst:
+                new_mains = make_windows(mains.values.flatten(), pad=True)
+                new_mains = (new_mains - self.mains_mean) / self.mains_std
+                processed_mains_lst.append(pd.DataFrame(new_mains))
+
+            appliance_list = []
+            for app_name, app_df_lst in submeters_lst:
+                if app_name not in self.appliance_params:
+                    raise ApplianceNotFoundError(f"Parameters for appliance '{app_name}' not found!")
+                app_min = self.appliance_params[app_name]['min']
+                app_max = self.appliance_params[app_name]['max']
+                # A constant signal has max == min; use a unit span so the
+                # min-max scaling yields zeros instead of NaN/inf targets.
+                app_range = (app_max - app_min) or 1
+                processed_app_dfs = []
+                for app_df in app_df_lst:
+                    new_app_readings = make_windows(app_df.values.flatten(), pad=True)
+                    new_app_readings = (new_app_readings - app_min) / app_range
+                    processed_app_dfs.append(pd.DataFrame(new_app_readings))
+                appliance_list.append((app_name, processed_app_dfs))
+
+            return processed_mains_lst, appliance_list
+
+        # Any other method is treated as 'test': no padding is applied here.
+        processed_mains_lst = []
+        for mains in mains_lst:
+            new_mains = make_windows(mains.values.flatten(), pad=False)
+            new_mains = (new_mains - self.mains_mean) / self.mains_std
+            new_mains = new_mains.reshape((-1, self.sequence_length))
+            processed_mains_lst.append(pd.DataFrame(new_mains))
+        return processed_mains_lst
+
+    def set_mains_params(self, train_main):
+        """Computes and sets normalization parameters for the mains data."""
+        all_mains_data = np.concatenate([mains.values.flatten() for mains in train_main])
+        self.mains_params = {
+            'mean': np.mean(all_mains_data),
+            'std': np.std(all_mains_data),
+            'min': np.min(all_mains_data),
+            'max': np.max(all_mains_data)
+        }
+
+    def set_appliance_params(self, train_appliances):
+        """Computes and sets normalization parameters for each appliance."""
+        for (app_name, df_list) in train_appliances:
+            app_data = np.concatenate([df.values for df in df_list])
+            app_mean = np.mean(app_data)
+            app_std = np.std(app_data)
+            if app_std < 1:
+                app_std = 100  # Avoid division by zero for flat signals
+            self.appliance_params[app_name] = {
+                'mean': app_mean,
+                'std': app_std,
+                'min': np.min(app_data),
+                'max': np.max(app_data)
             }
 
-    def classify(self, apps, threshold: float = 15.0):
-        L, pad = self.sequence_length, self.sequence_length // 2
-        out = []
-        for app, dfs in apps:
-            proc = []
-            for df in dfs:
-                v = df.values.flatten()  # Flatten the DataFrame to 1D array
-                v[v <= threshold] = 0
-                v[v >  threshold] = 1
-                v = np.pad(v, (pad, pad))
-                w = np.array([v[i:i+L] for i in range(len(v)-L+1)], np.float32)  # Overlapping windows
-                proc.append(pd.DataFrame(w))
-            out.append((app, proc))
-        return out
-
-    def partial_fit(self, mains, apps, do_preprocessing=True, **_):
-
+    def partial_fit(self, train_main, train_appliances, do_preprocessing=True, **load_kwargs):
+        """Trains the model on a chunk of data."""
+        _log_print("...............RNN_attention_classification partial_fit running...............")
+        
         if not self.appliance_params:
-            self.set_appliance_params(apps)
-        self.set_mains_params(mains)
+            self.set_appliance_params(train_appliances)
+        if not self.mains_params:
+            self.set_mains_params(train_main)
 
         if do_preprocessing:
-            cls_targets = self.classify(copy.deepcopy(apps))
-            mains, apps = preprocess(
-                sequence_length=self.sequence_length,
-                mains_mean=self.mains_mean,
-                mains_std=self.mains_std,
-                mains_lst=mains,
-                submeters_lst=apps,
-                method="train",
-                appliance_params=self.appliance_params,
-                windowing=False
-            )
-
-        X = torch.tensor(pd.concat(mains).values,
-                         dtype=torch.float32).unsqueeze(1)   # (N,1,L)
-        N = X.size(0)  # Number of samples
-        perm = torch.randperm(N)
-        split = int(0.15 * N)
-        val_idx, tr_idx = perm[:split], perm[split:]
-        X_tr, X_val = X[tr_idx].to(self.device), X[val_idx].to(self.device)
-
-        y_reg, y_cls = {}, {}
-        for app, dfs in apps:
-            y_reg[app] = torch.tensor(pd.concat(dfs).values, dtype=torch.float32)
-        for app, dfs in cls_targets:
-            y_cls[app] = torch.tensor(pd.concat(dfs).values, dtype=torch.float32)
-
-        mse, bce = nn.MSELoss(), nn.BCELoss()
-
-        for app in y_reg:
-            y_tr = y_reg[app][tr_idx].to(self.device)
-            y_val = y_reg[app][val_idx].to(self.device)
-            c_tr = y_cls[app][tr_idx].to(self.device)
-            c_val = y_cls[app][val_idx].to(self.device)
-
-            if app not in self.models:
-                self.models[app] = self._fresh_network()
-                self.best[app] = np.inf
-
-            net = self.models[app]
-            optim = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
-
-            loader = DataLoader(
-                TensorDataset(X_tr, y_tr, c_tr),
-                batch_size=self.batch_size, shuffle=True
-            )
-
-            # Training loop
-            for ep in range(self.n_epochs):
-                net.train()
-                run_loss = 0.0
-                bar = tqdm(loader,
-                           desc=f"{app} ▏epoch {ep+1}/{self.n_epochs}",
-                           leave=False, unit="batch")
-                for xb, yb, cb in bar:
-                    optim.zero_grad()
-                    pr, pc, _ = net(xb)
-                    loss = mse(pr, yb) + bce(pc, cb)
-                    loss.backward()
-                    optim.step()
-                    run_loss += loss.item()
-                    bar.set_postfix(loss=f"{loss.item():.4f}")
-
-                avg_loss = run_loss / len(loader)
-
-                # Validation
-                net.eval()
-                with torch.no_grad():
-                    vr, vc, _ = net(X_val)  
-                    v_loss = mse(vr, y_val).item() + bce(vc, c_val).item()
-
-                tqdm.write(
-                    f"[{app}] Epoch {ep+1}/{self.n_epochs} | "
-                    f"Train Loss: {avg_loss:.4f} | Val Loss: {v_loss:.4f}"
-                )
-
-                if v_loss < self.best[app]:
-                    self.best[app] = v_loss
-                    torch.save(net.state_dict(), f"rnn_att-{app}.pth")
-
-            net.load_state_dict(torch.load(f"rnn_att-{app}.pth",
-                                           map_location=self.device))
-
-    def disaggregate_chunk(self, mains, model=None, do_preprocessing=True):
+            # Create classification targets before normalizing appliance data
+            classify_appliance = copy.deepcopy(train_appliances)
+            classification = self.classify(classify_appliance)
+            
+            # Normalize mains and appliance data
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
+        
+        # Reshape all data into sequences
+        train_main = pd.concat(train_main, axis=0).values.reshape((-1, self.sequence_length, 1))
+
+        # Process appliance power data
+        new_train_appliances = []
+        for app_name, app_dfs in train_appliances:
+            app_df_values = pd.concat(app_dfs, axis=0).values.reshape((-1, self.sequence_length))
+            new_train_appliances.append((app_name, app_df_values))
+        train_appliances = new_train_appliances
+
+        # Process classification target data
+        new_train_appliances_classification = {}
+        for app_name, app_dfs in classification:
+            app_df_values = pd.concat(app_dfs, axis=0).values.reshape((-1, self.sequence_length))
+            new_train_appliances_classification[app_name] = app_df_values
+        
+        self.att_models = {}
+        for appliance_name, power in train_appliances:
+            if appliance_name not in self.models:
+                _log_print(f"First time training for {appliance_name}")
+                self.models[appliance_name], self.att_models[appliance_name] = self.return_network()
+            else:
+                _log_print(f"Retraining model for {appliance_name}")
+
+            model = self.models[appliance_name]
+            if train_main.size > 10:
+                    # Combine power and classification targets for splitting
+                    power_classification_target = np.concatenate(
+                        (power, new_train_appliances_classification[appliance_name]), axis=1)
+
+                    # Create training and validation sets
+                    train_x, v_x, train_y_combined, v_y_combined = train_test_split(
+                        train_main, power_classification_target, test_size=0.15, random_state=10)
+
+                    # Separate power and classification targets after splitting
+                    train_y = train_y_combined[:, :self.sequence_length]
+                    v_y = v_y_combined[:, :self.sequence_length]
+                    train_c = train_y_combined[:, self.sequence_length:]
+                    v_c = v_y_combined[:, self.sequence_length:]
+
+                    # Convert to PyTorch Tensors
+                    train_x = torch.tensor(train_x, dtype=torch.float32).permute(0, 2, 1).to(self.device)
+                    v_x = torch.tensor(v_x, dtype=torch.float32).permute(0, 2, 1).to(self.device)
+                    train_y = torch.tensor(train_y, dtype=torch.float32).to(self.device)
+                    v_y = torch.tensor(v_y, dtype=torch.float32).to(self.device)
+                    train_c = torch.tensor(train_c, dtype=torch.float32).to(self.device)
+                    v_c = torch.tensor(v_c, dtype=torch.float32).to(self.device)
+
+                    # Optimizer and loss functions, matching TensorFlow
+                    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
+                    mse_loss = nn.MSELoss()
+                    bce_loss = nn.BCELoss()
+
+                    best_val_loss = float('inf')
+                    # True once this run has written a checkpoint, so the reload below never reads a missing/stale file.
+                    checkpoint_saved = False
+                    filepath = checkpoint_path(".pth")
+                    # The loader reshuffles on every iteration, so build it once.
+                    train_dataset = TensorDataset(train_x, train_y, train_c)
+                    train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
+
+                    for epoch in range(self.n_epochs):
+                        model.train()
+                        epoch_losses = []
+                        for batch_x, batch_y, batch_c in train_loader:
+                            optimizer.zero_grad()
+                            output, classification_output, _ = model(batch_x)
+                            # Combined loss (regression + classification)
+                            loss = (
+                                self.regression_loss_weight * mse_loss(output, batch_y)
+                                + self.classification_loss_weight * bce_loss(classification_output, batch_c)
+                            )
+                            loss.backward()
+                            optimizer.step()
+                            epoch_losses.append(loss.item())
+
+                        # Validation
+                        model.eval()
+                        with torch.no_grad():
+                            val_output, val_classification, _ = model(v_x)
+                            val_loss = (
+                                self.regression_loss_weight * mse_loss(val_output, v_y)
+                                + self.classification_loss_weight * bce_loss(val_classification, v_c)
+                            ).item()  # plain float: no tensor kept in best_val_loss
+
+                        avg_train_loss = np.mean(epoch_losses)
+                        _log_print(f"Epoch {epoch+1}/{self.n_epochs} - loss: {avg_train_loss:.4f} - val_loss: {val_loss:.4f}")
+
+                        # Save the best model based on validation loss
+                        if val_loss < best_val_loss:
+                            best_val_loss = val_loss
+                            checkpoint_saved = True
+                            torch.save(model.state_dict(), filepath)
+                            _log_print(f"Validation loss improved, saving model to {filepath}")
+                    # Load the best performing model (skipped if nothing was saved)
+                    if checkpoint_saved:
+                        model.load_state_dict(torch.load(filepath, map_location=self.device))
+
+    def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
+        """Disaggregates a chunk of mains data."""
         if model is not None:
             self.models = model
-        if do_preprocessing:
-            mains = preprocess(
-                sequence_length=self.sequence_length,
-                mains_mean=self.mains_mean,
-                mains_std=self.mains_std,
-                mains_lst=mains,
-                submeters_lst=None,
-                method="test",
-                appliance_params=self.appliance_params,
-                windowing=False
-            )
 
-        L = self.sequence_length
-        out = []
-        for m in mains:
-            X = torch.tensor(m.values, dtype=torch.float32
-                            ).unsqueeze(1).to(self.device)
-            disc = {}
-            for app, net in self.models.items():
-                net.eval()
+        if do_preprocessing:
+            test_main_list = self.call_preprocessing(
+                test_main_list, submeters_lst=None, method='test')
+
+        test_predictions = []
+        for test_mains_df in test_main_list:
+            disggregation_dict = {}
+            test_main_array = test_mains_df.values.reshape((-1, self.sequence_length, 1))
+            test_main_tensor = torch.tensor(test_main_array, dtype=torch.float32).permute(0, 2, 1).to(self.device)
+
+            for appliance in self.models:
+                model = self.models[appliance]
+                model.eval()
+                
                 with torch.no_grad():
-                    pr, _, _ = net(X)  
-                    pr = pr.cpu().numpy()
-
-                # overlap-mean
-                def ov(a):
-                    s, c = np.zeros(len(a)+L-1), np.zeros(len(a)+L-1)  # sums, counts
-                    for i,row in enumerate(a):
-                        s[i:i+L] += row
-                        c[i:i+L] += 1
-                    return s/c
-
-                power = ov(pr)
-                p = self.appliance_params[app]
-                power = np.clip(p["min"] + power*(p["max"]-p["min"]), 0, None)
-                disc[app] = pd.Series(power, dtype="float32")
-            out.append(pd.DataFrame(disc, dtype="float32"))
-        return out
-
-    # NILMTK shortcut wrappers
-    def train(self, mains, apps, **kw):
-        return self.partial_fit(mains, apps, **kw)
-
-    def disaggregate(self, mains, store):
-        preds = self.disaggregate_chunk(mains)
-        for i, df in enumerate(preds):
-            for col in df.columns:
-                store.put(f"/building1/elec/meter{i+1}/{col}", df[col])
+                    prediction_output, _, _ = model(test_main_tensor)
+                    prediction_output = prediction_output.cpu().numpy()
+                
+                # Average predictions over overlapping windows to get a single series
+                window_length = self.sequence_length
+                n = len(prediction_output) + window_length - 1
+                sum_arr = np.zeros(n)
+                counts_arr = np.zeros(n)
+                
+                for i, p in enumerate(prediction_output):
+                    sum_arr[i:i+window_length] += p.flatten()
+                    counts_arr[i:i+window_length] += 1
+                
+                # Avoid division by zero
+                counts_arr[counts_arr == 0] = 1
+                averaged_prediction = sum_arr / counts_arr
+
+                # Denormalize the prediction
+                app_min = self.appliance_params[appliance]['min']
+                app_max = self.appliance_params[appliance]['max']
+                denormalized_prediction = app_min + (averaged_prediction * (app_max - app_min))
+                
+                # Set negative values to zero
+                denormalized_prediction[denormalized_prediction < 0] = 0
+                df = pd.Series(denormalized_prediction)
+                disggregation_dict[appliance] = df
+
+            results = pd.DataFrame(disggregation_dict, dtype='float32')
+            test_predictions.append(results)
+
+        return test_predictions
diff --git a/nilmtk_contrib/torch/seq2point.py b/nilmtk_contrib/torch/seq2point.py
index ee5ee89..e53db66 100644
--- a/nilmtk_contrib/torch/seq2point.py
+++ b/nilmtk_contrib/torch/seq2point.py
@@ -1,235 +1,301 @@
 from collections import OrderedDict
-import os
 import numpy as np
 import pandas as pd
 import torch
 import torch.nn as nn
 from torch.utils.data import TensorDataset, DataLoader
-from tqdm import tqdm
 from nilmtk.disaggregate import Disaggregator
-from nilmtk_contrib.torch.preprocessing import preprocess
 
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
 class SequenceLengthError(Exception):
     pass
 
-
 class ApplianceNotFoundError(Exception):
     pass
 
-
 class Seq2PointTorch(Disaggregator):
     """
-    Sequence-to-Point NILM disaggregator using PyTorch.
-    Uses 1D CNN to map power sequences to single appliance power values.
+    Sequence-to-Point neural network for Non-Intrusive Load Monitoring (NILM).
+    
+    Based on "Sequence-to-Point Learning With Neural Networks for Non-Intrusive Load Monitoring"
+    by Zhang et al., published in Proceedings of the AAAI Conference on Artificial Intelligence, 2018.
+    DOI: https://doi.org/10.1609/aaai.v32i1.11873
+    
+    This model uses a sequence-to-point learning approach where the input is a window 
+    of mains power consumption and the output is a single point prediction of the target 
+    appliance power. The architecture uses convolutional neural networks that can inherently 
+    learn appliance signatures to reduce the identifiability problem in energy disaggregation.
+    
+    Architecture Overview:
+    - Multiple 1D convolutional layers for feature extraction from power sequences
+    - Dropout layer for regularization
+    - Fully connected layers for final power prediction
+    - Single point output from sequence input (sequence-to-point learning)
+    
+    Args:
+        params (dict): Dictionary containing model hyperparameters:
+            - sequence_length (int): Length of input sequences (default: 99, must be odd)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+            - appliance_params (dict): Appliance-specific normalization parameters
+            - mains_mean (float): Mean normalization for mains power (default: 1800)
+            - mains_std (float): Standard deviation for mains power (default: 600)
+            - chunk_wise_training (bool): Enable chunk-wise training (default: False)
     """
     def __init__(self, params):
+        """Initializes the disaggregator and its hyperparameters."""
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
         super().__init__()
         self.MODEL_NAME = "Seq2PointTorch"
-        self.models = OrderedDict()  # Store separate models for each appliance
+        self.models = OrderedDict()
         self.file_prefix = f"{self.MODEL_NAME.lower()}-temp-weights"
         
-        # Extract hyperparameters from params dict
         self.chunk_wise_training = params.get("chunk_wise_training", False)
         self.sequence_length = params.get("sequence_length", 99)
         self.n_epochs = params.get("n_epochs", 10)
         self.batch_size = params.get("batch_size", 512)
-        self.appliance_params = params.get("appliance_params", {})  # Normalization stats
+        self.appliance_params = params.get("appliance_params", {})
         self.mains_mean = params.get("mains_mean", 1800)
         self.mains_std = params.get("mains_std", 600)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-        # Sequence length must be odd for proper windowing
+        
         if self.sequence_length % 2 == 0:
-            raise SequenceLengthError("Sequence length should be odd!")
+            raise SequenceLengthError("Sequence length must be odd for proper windowing.")
 
-    def _build_network(self):
-        """Build the 1D CNN network architecture for sequence-to-point mapping"""
-        seq_len = self.sequence_length
-        # Calculate reduction in sequence length after all conv layers
-        conv_reduction = (10-1) + (8-1) + (6-1) + (5-1) + (5-1)  # = 29
-        
-        model = nn.Sequential(
-            # Feature extraction layers with 1D convolutions
-            nn.Conv1d(1, 30, kernel_size=10, stride=1), nn.ReLU(),
-            nn.Conv1d(30, 30, kernel_size=8, stride=1), nn.ReLU(),
-            nn.Conv1d(30, 40, kernel_size=6, stride=1), nn.ReLU(),
-            nn.Conv1d(40, 50, kernel_size=5, stride=1), nn.ReLU(),
-            nn.Dropout(0.2),
-            nn.Conv1d(50, 50, kernel_size=5, stride=1), nn.ReLU(),
-            nn.Dropout(0.2),
+    def return_network(self):
+        """Builds the 1D CNN model, mirroring the original TensorFlow architecture."""
+        class Seq2PointNet(nn.Module):
+            """The Seq2Point neural network architecture."""
+            def __init__(self, sequence_length):
+                super().__init__()
+                # Layer definitions to match the original TensorFlow model
+                self.conv1 = nn.Conv1d(1, 30, kernel_size=10, stride=1)
+                self.conv2 = nn.Conv1d(30, 30, kernel_size=8, stride=1)
+                self.conv3 = nn.Conv1d(30, 40, kernel_size=6, stride=1)
+                self.conv4 = nn.Conv1d(40, 50, kernel_size=5, stride=1)
+                self.conv5 = nn.Conv1d(50, 50, kernel_size=5, stride=1)
+                self.dropout = nn.Dropout(0.2)
+                
+                # Calculate the flattened size dynamically after convolutions
+                self._calculate_flatten_size(sequence_length)
+                
+                self.fc1 = nn.Linear(self.flatten_size, 1024)
+                self.fc2 = nn.Linear(1024, 1)
+                
+                self._initialize_weights()
+
+            def _calculate_flatten_size(self, seq_len):
+                """Calculates the input size for the fully connected layer."""
+                # Each conv layer reduces length by (kernel_size - 1)
+                conv_output_length = seq_len - (10-1) - (8-1) - (6-1) - (5-1) - (5-1)
+                self.flatten_size = 50 * conv_output_length
             
-            # Flatten for fully connected layers
-            nn.Flatten(),
+            def _initialize_weights(self):
+                """Initializes weights to match TensorFlow's default (glorot_uniform)."""
+                for m in self.modules():
+                    if isinstance(m, (nn.Conv1d, nn.Linear)):
+                        nn.init.xavier_uniform_(m.weight)
+                        if m.bias is not None:
+                            nn.init.zeros_(m.bias)
             
-            # Dense layers for final prediction
-            nn.Linear(50 * (seq_len - conv_reduction), 1024), nn.ReLU(),
-            nn.Dropout(0.2),
-            nn.Linear(1024, 1)  # Output single power value
-        )
-        return model.to(self.device)
-
-    def partial_fit(self, train_main, train_appliances, do_preprocessing=True,
-                    current_epoch=0, **load_kwargs):
-        """Train models on a chunk of data (supports incremental learning)"""
+            def forward(self, x):
+                # Forward pass through the network
+                x = torch.relu(self.conv1(x))
+                x = torch.relu(self.conv2(x))
+                x = torch.relu(self.conv3(x))
+                x = torch.relu(self.conv4(x))
+                x = self.dropout(x)
+                x = torch.relu(self.conv5(x))
+                x = self.dropout(x)
+                x = x.flatten(1) # Flatten the output for the dense layers
+                x = torch.relu(self.fc1(x))
+                x = self.dropout(x)
+                x = self.fc2(x)
+                return x
         
-        # Compute appliance-specific normalization parameters if not provided
+        model = Seq2PointNet(self.sequence_length).to(self.device)
+        return model
+
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """
+        Turn raw mains into normalized, zero-padded sliding windows and, when
+        training, standardize each appliance series into a single column.
+
+        Args:
+            mains_lst: iterable of mains DataFrames.
+            submeters_lst: iterable of (appliance_name, [DataFrame, ...]) pairs;
+                only read when method == 'train'.
+            method: 'train' to also process appliances; anything else is
+                handled as test data.
+
+        Returns:
+            (mains_windows, appliance_targets) for 'train', otherwise only the
+            list of mains windows.
+
+        Raises:
+            ApplianceNotFoundError: an appliance has no entry in appliance_params.
+        """
+        def mains_windows(frame):
+            # Pad half a window on each side, then slide with stride 1.
+            length = self.sequence_length
+            half = length // 2
+            series = np.pad(frame.values.flatten(), (half, half), 'constant', constant_values=(0, 0))
+            stacked = np.array([series[k:k + length] for k in range(len(series) - length + 1)])
+            return pd.DataFrame((stacked - self.mains_mean) / self.mains_std)
+
+        # Mains are windowed identically for training and testing.
+        processed_mains_lst = [mains_windows(frame) for frame in mains_lst]
+        if method != 'train':
+            return processed_mains_lst
+
+        appliance_list = []
+        for app_name, app_df_lst in submeters_lst:
+            if app_name not in self.appliance_params:
+                raise ApplianceNotFoundError(f"Parameters for appliance '{app_name}' not found!")
+            stats = self.appliance_params[app_name]
+            app_mean, app_std = stats['mean'], stats['std']
+            # Standardize each reading; one column, one row per sample.
+            scaled = [
+                pd.DataFrame((frame.values.reshape((-1, 1)) - app_mean) / app_std)
+                for frame in app_df_lst
+            ]
+            appliance_list.append((app_name, scaled))
+        return processed_mains_lst, appliance_list
+
+    def set_appliance_params(self, train_appliances):
+        """Derive per-appliance mean/std from training data and store them."""
+        for app_name, df_list in train_appliances:
+            stacked = np.concatenate([frame.values for frame in df_list])
+            mean_value, std_value = np.mean(stacked), np.std(stacked)
+            # Near-flat signals get a fixed std of 100 to keep scaling stable.
+            if std_value < 1:
+                std_value = 100
+            self.appliance_params[app_name] = {'mean': mean_value, 'std': std_value}
+        _log_print("Appliance parameters set:", self.appliance_params)
+
+    def partial_fit(self, train_main, train_appliances, do_preprocessing=True, current_epoch=0, **load_kwargs):
+        """Train one network per appliance on a chunk of data.
+
+        Args:
+            train_main: list of mains DataFrames (expected to be already
+                windowed and normalised when ``do_preprocessing`` is False).
+            train_appliances: list of ``(appliance_name, [DataFrame, ...])``.
+            do_preprocessing: run ``call_preprocessing`` on the inputs first.
+            current_epoch, load_kwargs: accepted but not used by this method.
+        """
         if not self.appliance_params:
             self.set_appliance_params(train_appliances)
 
-        # Preprocess data: windowing, normalization, etc.
+        _log_print("...............Seq2Point partial_fit running...............")
         if do_preprocessing:
-            train_main, train_appliances = preprocess(
-                sequence_length=self.sequence_length,
-                mains_mean=self.mains_mean,
-                mains_std=self.mains_std,
-                mains_lst=train_main,
-                submeters_lst=train_appliances,
-                method="train",
-                appliance_params=self.appliance_params,
-                windowing=False
-            )
-
-        # Prepare main power data for CNN input (batch_size, channels, sequence_length)
-        train_main = pd.concat(train_main, axis=0).values.reshape(
-            -1, self.sequence_length, 1
-        )
-        train_main = torch.tensor(train_main, dtype=torch.float32).permute(0, 2, 1)
-
-        # Prepare appliance power data
-        new_train_apps = []
-        for app_name, app_df_list in train_appliances:
-            app_df = pd.concat(app_df_list, axis=0).values.reshape(-1, 1)
-            new_train_apps.append(
-                (app_name, torch.tensor(app_df, dtype=torch.float32))
-            )
-        train_appliances = new_train_apps
-
-        # Split data into training and validation sets
-        n_total = train_main.size(0)
-        val_split = int(0.15 * n_total)
-        idx = torch.randperm(n_total)
-        tr_idx, val_idx = idx[val_split:], idx[:val_split]
-
-        mains_train = train_main[tr_idx].to(self.device)
-        mains_val = train_main[val_idx].to(self.device)
-
-        # Train a separate model for each appliance
-        for appliance, power_tensor in train_appliances:
-            power_tensor = power_tensor.to(self.device)
-            power_train = power_tensor[tr_idx]
-            power_val = power_tensor[val_idx]
-
-            # Create new model if this appliance hasn't been seen before
-            if appliance not in self.models:
-                print("First model training for", appliance)
-                self.models[appliance] = self._build_network()
-            else:
-                print("Started Retraining model for", appliance)
-
-            model = self.models[appliance]
-            optimiser = torch.optim.Adam(model.parameters())
-            loss_fn = nn.MSELoss()
-
-            best_val = np.inf
-            best_file = f"{self.file_prefix}-{appliance.replace(' ', '_')}-epoch{current_epoch}.pth"
-
-            # Create DataLoader for batch processing
-            dataset = TensorDataset(mains_train, power_train)
-            loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
-
-            # Training loop
-            for epoch in range(self.n_epochs):
-                model.train()
-                epoch_losses = []
-
-                # Training phase
-                for x_batch, y_batch in loader:
-                    x_batch, y_batch = x_batch.to(self.device), y_batch.to(self.device)
-                    optimiser.zero_grad()
-                    preds = model(x_batch).squeeze(1)
-                    loss = loss_fn(preds, y_batch)
-                    loss.backward()
-                    optimiser.step()
-                    epoch_losses.append(loss.item())
-
-                # Validation phase
-                model.eval()
-                with torch.no_grad():
-                    val_preds = model(mains_val).squeeze(1)
-                    val_loss = loss_fn(val_preds, power_val).item()
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
 
-                avg_loss = np.mean(epoch_losses)
-                tqdm.write(f"[{appliance}] Epoch {epoch+1}/{self.n_epochs} | Train Loss: {avg_loss:.4f} | Val Loss: {val_loss:.4f}")
+        # Mains windows become (n_windows, sequence_length, 1); targets one value per window
+        train_main = pd.concat(train_main, axis=0).values.reshape((-1, self.sequence_length, 1))
+        train_appliances = [
+            (app_name, pd.concat(app_dfs, axis=0).values.reshape((-1, 1)))
+            for app_name, app_dfs in train_appliances
+        ]
 
-                # Save best model based on validation loss
-                if val_loss < best_val:
-                    best_val = val_loss
-                    torch.save(model.state_dict(), best_file)
+        for appliance_name, power in train_appliances:
+            if appliance_name not in self.models:
+                _log_print(f"First time training for {appliance_name}")
+                self.models[appliance_name] = self.return_network()
+            else:
+                _log_print(f"Retraining model for {appliance_name}")
 
-            # Load the best model weights
-            model.load_state_dict(torch.load(best_file, map_location=self.device))
+            model = self.models[appliance_name]
+            if train_main.size > 10:
+                # PyTorch Conv1d expects (batch, channels, length)
+                train_main_tensor = torch.tensor(train_main, dtype=torch.float32).permute(0, 2, 1).to(self.device)
+                # reshape(-1) keeps targets 1-D even for a single sample; squeeze() would
+                # collapse them to a 0-d tensor that cannot be indexed below.
+                power_tensor = torch.tensor(power, dtype=torch.float32).reshape(-1).to(self.device)
+
+                # Random train/validation split (about 15% held out)
+                n_samples = train_main_tensor.size(0)
+                val_size = max(1, int(0.15 * n_samples)) if n_samples > 1 else 0
+                indices = torch.randperm(n_samples)
+                train_idx, val_idx = indices[val_size:], indices[:val_size]
+                train_X, train_y = train_main_tensor[train_idx], power_tensor[train_idx]
+                val_X, val_y = train_main_tensor[val_idx], power_tensor[val_idx]
+
+                optimizer = torch.optim.Adam(model.parameters())
+                criterion = nn.MSELoss()
+                best_val_loss = float('inf')
+                filepath = checkpoint_path(".pth")
+
+                # The training set is fixed across epochs, so build the loader once
+                train_dataset = TensorDataset(train_X, train_y)
+                train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
+
+                for epoch in range(self.n_epochs):
+                    model.train()
+                    epoch_losses = []
+                    for batch_X, batch_y in train_loader:
+                        optimizer.zero_grad()
+                        # Flatten so the shape matches batch_y even for a batch of one
+                        predictions = model(batch_X).reshape(-1)
+                        loss = criterion(predictions, batch_y)
+                        loss.backward()
+                        # Gradient clipping for stability
+                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+                        optimizer.step()
+                        epoch_losses.append(loss.item())
+                    avg_train_loss = float(np.mean(epoch_losses))
+                    # Validate; with no held-out samples fall back to the training loss
+                    val_loss = avg_train_loss
+                    if val_size > 0:
+                        model.eval()
+                        with torch.no_grad():
+                            val_loss = criterion(model(val_X).reshape(-1), val_y).item()
+                    _log_print(f"Epoch {epoch+1}/{self.n_epochs} - loss: {avg_train_loss:.4f} - val_loss: {val_loss:.4f}")
+
+                    # Save the best model based on validation loss
+                    if val_loss < best_val_loss:
+                        best_val_loss = val_loss
+                        torch.save(model.state_dict(), filepath)
+                        _log_print(f"Validation loss improved, saving model to {filepath}")
+
+                # Restore the best weights; skip if no checkpoint was written (NaN loss or zero epochs)
+                if best_val_loss < float('inf'):
+                    model.load_state_dict(torch.load(filepath, map_location=self.device))
 
     def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
-        """Disaggregate power consumption for each appliance from aggregate mains data"""
-        
+        """Run every appliance model over each mains chunk; returns one DataFrame per chunk."""
         if model is not None:
             self.models = model
 
-        # Preprocess test data similar to training data
         if do_preprocessing:
-            test_main_list = preprocess(
-                sequence_length=self.sequence_length,
-                mains_mean=self.mains_mean,
-                mains_std=self.mains_std,
-                mains_lst=test_main_list,
-                submeters_lst=None,
-                method="test",
-                appliance_params=self.appliance_params,
-                windowing=False
-            )
-
-        results = []
-        
-        # Process each chunk of test data
-        for mains_df in test_main_list:
-            # Prepare data for CNN input (batch_size, channels, sequence_length)
-            mains_np = mains_df.values.reshape(-1, self.sequence_length, 1)
-            mains_tensor = (
-                torch.tensor(mains_np, dtype=torch.float32)
-                .permute(0, 2, 1)
-                .to(self.device)
-            )
-
-            disagg = {}
-            
-            # Get predictions from each appliance model
-            for appliance, net in self.models.items():
-                net.eval()
-                with torch.no_grad():
-                    # Generate predictions and denormalize back to original power scale
-                    preds = (
-                        net(mains_tensor).cpu().numpy().flatten()
-                        * self.appliance_params[appliance]["std"]
-                        + self.appliance_params[appliance]["mean"]
-                    )
-                    # Ensure non-negative power values
-                    preds = np.clip(preds, 0, None)
-                    disagg[appliance] = pd.Series(preds, dtype="float32")
-
-            # Combine all appliance predictions for this chunk
-            results.append(pd.DataFrame(disagg, dtype="float32"))
-        return results
+            test_main_list = self.call_preprocessing(test_main_list, submeters_lst=None, method='test')
 
-    def set_appliance_params(self, train_appliances):
-        """Compute normalization statistics (mean, std) for each appliance"""
-        for app_name, df_list in train_appliances:
-            # Concatenate all data for this appliance and compute statistics
-            data = np.concatenate([df.values.flatten() for df in df_list])
-            mean, std = data.mean(), data.std()
+        chunk_frames = []
+        for mains_df in test_main_list:
+            windows = mains_df.values.reshape((-1, self.sequence_length, 1))
             
-            # Prevent division by zero in normalization
-            if std < 1:
-                std = 100
-            self.appliance_params[app_name] = {"mean": mean, "std": std}
+            # Conv1d wants channels before length: (batch, 1, sequence_length)
+            inputs = torch.tensor(windows, dtype=torch.float32).permute(0, 2, 1).to(self.device)
             
-        print(self.appliance_params)
\ No newline at end of file
+            per_appliance = {}
+            for appliance, net in self.models.items():
+                net.eval()
+                with torch.no_grad():
+                    normalised = net(inputs).cpu().numpy()
+
+                # Undo the target normalisation applied during training:
+                # watts = mean + normalised * std
+                stats = self.appliance_params[appliance]
+                watts = stats['mean'] + (normalised * stats['std'])
+
+                # Power cannot be negative, so floor the estimate at zero
+                watts = np.clip(watts, 0, None)
+                per_appliance[appliance] = pd.Series(watts.flatten())
+
+            # One float32 column per appliance for this chunk
+            chunk_df = pd.DataFrame(per_appliance, dtype='float32')
+            chunk_frames.append(chunk_df)
+        return chunk_frames
\ No newline at end of file
diff --git a/nilmtk_contrib/torch/seq2seq.py b/nilmtk_contrib/torch/seq2seq.py
index d9c1a6f..9213e8c 100644
--- a/nilmtk_contrib/torch/seq2seq.py
+++ b/nilmtk_contrib/torch/seq2seq.py
@@ -1,50 +1,74 @@
-import os, json, numpy as np, pandas as pd
-import torch, torch.nn as nn, torch.optim as optim
-from tqdm import tqdm
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
 from collections import OrderedDict
 from torch.utils.data import TensorDataset, DataLoader
 from nilmtk.disaggregate import Disaggregator
-from nilmtk_contrib.torch.preprocessing import preprocess
+
+from nilmtk_contrib.utils.model import initialize_runtime, legacy_print, module_logger, checkpoint_path
+
+logger = module_logger(__name__)
+_log_print = legacy_print(logger)
+class SequenceLengthError(ValueError):
+    """Raised for an even sequence_length; subclasses ValueError, which the previous code raised."""
+
+class ApplianceNotFoundError(Exception):
+    pass
 
 class Seq2SeqModel(nn.Module):
     """
-    Sequence-to-Sequence CNN model that maps input power sequences 
-    to output appliance power sequences of the same length.
+    Seq2Seq CNN for NILM mirroring the original TensorFlow architecture: maps
+    (batch, sequence_length, 1) mains windows to (batch, sequence_length) outputs.
     """
-    def __init__(self, seq_len):
+    def __init__(self, sequence_length):
         super().__init__()
+        self.sequence_length = sequence_length
+        # --- Encoder Layers ---
+        self.conv1 = nn.Conv1d(1, 30, kernel_size=10, stride=2, padding=0)
+        self.conv2 = nn.Conv1d(30, 30, kernel_size=8, stride=2, padding=0)
+        self.conv3 = nn.Conv1d(30, 40, kernel_size=6, stride=1, padding=0)
+        self.conv4 = nn.Conv1d(40, 50, kernel_size=5, stride=1, padding=0)
+        self.dropout1 = nn.Dropout(0.2)
+        self.conv5 = nn.Conv1d(50, 50, kernel_size=5, stride=1, padding=0)
+        self.dropout2 = nn.Dropout(0.2)
+        # Size of the flattened encoder output (raises ValueError if the window is too short)
+        self._calculate_flatten_size(sequence_length)
 
-        self.seq_len = seq_len
+        # --- Decoder Layers ---
+        self.flatten = nn.Flatten()
+        self.fc1 = nn.Linear(self.flat_size, 1024)
+        self.dropout3 = nn.Dropout(0.2)
+        self.fc2 = nn.Linear(1024, sequence_length)
         
-        # Encoder: 1D CNN layers with different strides for feature extraction
-        self.conv1 = nn.Conv1d(1, 30, 10, stride=2)
-        self.conv2 = nn.Conv1d(30,30, 8,  stride=2)
-        self.conv3 = nn.Conv1d(30,40, 6,  stride=1)
-        self.conv4 = nn.Conv1d(40,50, 5,  stride=1)
-        self.dropout1 = nn.Dropout(.2)
-        self.conv5 = nn.Conv1d(50,50, 5, stride=1)
-        self.dropout2 = nn.Dropout(.2)
-
-        # Calculate the flattened size after all convolutions
+        self._init_weights()
+
+    def _calculate_flatten_size(self, seq_len):
+        """Sets self.flat_size, the fc1 input size; raises ValueError if seq_len is too short."""
+        # Each unpadded Conv1d shrinks the length to (L - kernel) // stride + 1
         L = seq_len
-        L = (L - 10)//2 + 1
-        L = (L - 8)//2 + 1
+        L = (L - 10) // 2 + 1
+        L = (L - 8) // 2 + 1
         L = L - 6 + 1
         L = L - 5 + 1
         L = L - 5 + 1
-        flat_size = 50 * L
-
-        # Decoder: Fully connected layers to reconstruct sequence
-        self.flatten  = nn.Flatten()
-        self.fc1      = nn.Linear(flat_size, 1024)
-        self.dropout3 = nn.Dropout(.2)
-        self.fc2      = nn.Linear(1024, seq_len)  # Output same length as input
+        if L < 1:
+            raise ValueError(f"sequence_length={seq_len} is too short for the encoder; need at least 76")
+        self.flat_size = 50 * L
+
+    def _init_weights(self):
+        """Initializes weights to match TensorFlow's default (glorot_uniform)."""
+        for m in self.modules():
+            if isinstance(m, (nn.Conv1d, nn.Linear)):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
 
     def forward(self, x):
-        # Input: [B, seq_len, 1] → rearrange for Conv1d: [B, 1, seq_len]
-        x = x.permute(0,2,1)
+        # Input shape: (batch, seq_len, 1) -> permute for Conv1D
+        x = x.permute(0, 2, 1)
         
-        # Encoder: feature extraction through conv layers
+        # --- Encoder ---
         x = torch.relu(self.conv1(x))
         x = torch.relu(self.conv2(x))
         x = torch.relu(self.conv3(x))
@@ -53,189 +77,259 @@ def forward(self, x):
         x = torch.relu(self.conv5(x))
         x = self.dropout2(x)
         
-        # Decoder: reconstruct to original sequence length
+        # --- Decoder ---
         x = self.flatten(x)
         x = torch.relu(self.fc1(x))
         x = self.dropout3(x)
-        x = self.fc2(x)           # [B, seq_len]
+        x = self.fc2(x)  # no activation: linear output of shape (batch, sequence_length)
         return x
 
 class Seq2Seq(Disaggregator):
     """
-    NILM disaggregator using sequence-to-sequence learning.
-    Maps input power sequences to appliance power sequences of the same length.
+    Sequence-to-Sequence CNN for Non-Intrusive Load Monitoring (NILM).
+    
+    Based on the foundational sequence-to-sequence learning approach from:
+    "Sequence to Sequence Learning with Neural Networks" by Sutskever et al.
+    https://arxiv.org/abs/1409.3215
+    
+    This implementation adapts the sequence-to-sequence paradigm for energy disaggregation,
+    using a CNN-based encoder-decoder architecture instead of the original LSTM approach.
+    The model learns to map input sequences of aggregate power consumption to output 
+    sequences of individual appliance power consumption.
+    
+    Architecture Overview:
+    - Encoder: Multiple 1D convolutional layers with decreasing stride for feature extraction
+    - Decoder: Fully connected layers that reconstruct the sequence from encoded features
+    - Dropout layers for regularization throughout the network
+    - Sequence-to-sequence learning for temporal power disaggregation
+    
+    Args:
+        params (dict): Dictionary containing model hyperparameters:
+            - sequence_length (int): Length of input/output sequences (default: 99, must be odd)
+            - n_epochs (int): Number of training epochs (default: 10)
+            - batch_size (int): Training batch size (default: 512)
+            - appliance_params (dict): Appliance-specific normalization parameters
+            - chunk_wise_training (bool): Enable chunk-wise training (default: False)
     """
     def __init__(self, params):
-        super().__init__()
-
+        """Set up runtime state and read hyperparameters from ``params``; an even sequence_length raises SequenceLengthError."""
+        initialize_runtime(self, params, backends=("python", "numpy", "torch"))
         self.MODEL_NAME = "Seq2Seq"
         self.file_prefix = f"{self.MODEL_NAME.lower()}-temp-weights"
+        self.chunk_wise_training = params.get('chunk_wise_training', False)
+        self.sequence_length = params.get('sequence_length', 99)
+        self.n_epochs = params.get('n_epochs', 10)
+        self.models = OrderedDict()  # one network per appliance name
+        self.mains_mean = 1800  # fixed mains normalisation constants
+        self.mains_std = 600
+        self.batch_size = params.get('batch_size', 512)
+        self.appliance_params = params.get('appliance_params', {})
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         
-        # Extract hyperparameters
-        self.sequence_length     = params.get('sequence_length', 99)
         if self.sequence_length % 2 == 0:
-            raise ValueError("sequence_length must be odd")
-        self.n_epochs            = params.get('n_epochs', 10)
-        self.batch_size          = params.get('batch_size', 512)
-        self.mains_mean          = 1800
-        self.mains_std           = 600
-        self.appliance_params    = params.get('appliance_params', {})  # Normalization stats
-        self.models              = OrderedDict()  # Store separate models for each appliance
-        self.device              = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            raise SequenceLengthError("Sequence length must be odd!")
 
     def return_network(self):
-        """Factory method to create a new Seq2Seq model instance"""
+        """Builds a new Seq2SeqModel for self.sequence_length and moves it to self.device."""
         return Seq2SeqModel(self.sequence_length).to(self.device)
 
     def set_appliance_params(self, train_appliances):
-        """Compute normalization statistics (mean, std) for each appliance"""
-        for name, lst in train_appliances:
-            arr = pd.concat(lst, axis=0).values.flatten()
-            m, s = arr.mean(), arr.std()
-            # Prevent division by zero in normalization
-            if s < 1: s = 100
-            self.appliance_params[name] = {'mean':m, 'std':s}
-
-    def partial_fit(self, train_main, train_appliances,
-                    do_preprocessing=True, current_epoch=0, **_):
-        """Train models on a chunk of data (supports incremental learning)"""
-        
-        # Compute appliance-specific normalization parameters if not provided
+        """Record each appliance's mean/std for normalisation (std < 1 is replaced by 100)."""
+        for app_name, frames in train_appliances:
+            samples = np.concatenate([frame.values for frame in frames])
+            mean = np.mean(samples)
+            std = np.std(samples)
+            # A (near-)flat signal would make (x - mean) / std explode; use a fixed scale.
+            std = 100 if std < 1 else std
+            self.appliance_params[app_name] = {'mean': mean, 'std': std}
+
+    def partial_fit(self, train_main, train_appliances, do_preprocessing=True, current_epoch=0, **load_kwargs):
+        """Train one network per appliance on a chunk of data.
+
+        Args:
+            train_main: list of mains DataFrames (expected to be already
+                windowed and normalised when ``do_preprocessing`` is False).
+            train_appliances: list of ``(appliance_name, [DataFrame, ...])``.
+            do_preprocessing: run ``call_preprocessing`` on the inputs first.
+            current_epoch, load_kwargs: accepted but not used by this method.
+        """
+        _log_print("...............Seq2Seq partial_fit running...............")
         if not self.appliance_params:
             self.set_appliance_params(train_appliances)
 
-        # Preprocess data: windowing, normalization, etc.
         if do_preprocessing:
-            train_main, train_appliances = preprocess(
-                sequence_length=self.sequence_length,
-                mains_mean=self.mains_mean,
-                mains_std=self.mains_std,
-                mains_lst=train_main,
-                submeters_lst=train_appliances,
-                method="train",
-                appliance_params=self.appliance_params,
-                windowing=True
-            )
-
-        # Prepare main power data for training
-        mains_arr = pd.concat(train_main,axis=0).values \
-                     .reshape(-1, self.sequence_length, 1)
-
-        # Train a separate model for each appliance
-        for name, dfs in train_appliances:
-            # Prepare appliance power sequences (targets)
-            arr = pd.concat(dfs,axis=0).values \
-                    .reshape(-1, self.sequence_length)
-            
-            # Create new model if this appliance hasn't been seen before
-            if name not in self.models:
-                self.models[name] = self.return_network()
-            model = self.models[name]
-
-            # Convert to tensors
-            X = torch.tensor(mains_arr, dtype=torch.float32)
-            Y = torch.tensor(arr,       dtype=torch.float32)
-            
-            # Split into training and validation sets
-            split = int(0.85*len(X))
-
-            tr_ds = TensorDataset(X[:split], Y[:split])
-            va_ds = TensorDataset(X[split:], Y[split:])
-            tr = DataLoader(tr_ds, batch_size=self.batch_size, shuffle=True)
-            va = DataLoader(va_ds, batch_size=self.batch_size)
-
-            # Setup training components
-            opt     = optim.Adam(model.parameters())
-            loss_fn = nn.MSELoss()
-            best    = float('inf')
-            ckpt    = f"{self.file_prefix}-{name}-epoch{current_epoch}.pt"
-
-            # Training loop
-            for epoch in tqdm(range(self.n_epochs), desc=f"Train {name}"):
-                # Training phase
-                model.train()
-                for xb, yb in tr:
-                    xb, yb = xb.to(self.device), yb.to(self.device)
-                    opt.zero_grad()
-                    out = model(xb)                   # [B, seq_len]
-                    loss_fn(out, yb).backward()
-                    opt.step()
-
-                # Validation phase
-                model.eval()
-                val_losses = []
-                with torch.no_grad():
-                    for xb, yb in va:
-                        xb, yb = xb.to(self.device), yb.to(self.device)
-                        val_losses.append(loss_fn(model(xb), yb).item())
-                val_loss = sum(val_losses)/len(val_losses)
-                
-                # Save best model based on validation loss
-                if val_loss < best:
-                    best = val_loss
-                    torch.save(model.state_dict(), ckpt)
+            train_main, train_appliances = self.call_preprocessing(
+                train_main, train_appliances, 'train')
+
+        # Mains windows: (n_windows, sequence_length, 1); targets: (n_windows, sequence_length)
+        train_main = pd.concat(train_main, axis=0).values.reshape((-1, self.sequence_length, 1))
+        train_appliances = [
+            (app_name, pd.concat(app_dfs, axis=0).values.reshape((-1, self.sequence_length)))
+            for app_name, app_dfs in train_appliances
+        ]
+
+        for appliance_name, power in train_appliances:
+            if appliance_name not in self.models:
+                _log_print(f"First time training for {appliance_name}")
+                self.models[appliance_name] = self.return_network()
+            else:
+                _log_print(f"Retraining model for {appliance_name}")
 
-            # Load the best model weights
-            model.load_state_dict(torch.load(ckpt, map_location=self.device))
+            model = self.models[appliance_name]
+            if train_main.size > 10:
+                filepath = checkpoint_path(".pt")
+                train_main_tensor = torch.tensor(train_main, dtype=torch.float32)
+                power_tensor = torch.tensor(power, dtype=torch.float32)
+
+                # Use the last 15% of data for validation to mirror TensorFlow's behavior.
+                # Slice with an explicit index: [:-val_size] would be empty when val_size is 0.
+                n_total = len(train_main_tensor)
+                val_size = max(1, int(0.15 * n_total)) if n_total > 1 else 0
+                split = n_total - val_size
+                train_x = train_main_tensor[:split].to(self.device)
+                val_x = train_main_tensor[split:].to(self.device)
+                train_y = power_tensor[:split].to(self.device)
+                val_y = power_tensor[split:].to(self.device)
+
+                # Optimizer and loss function, with parameters matching TensorFlow
+                optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-07)
+                criterion = nn.MSELoss()
+                best_val_loss = float('inf')
+                train_dataset = TensorDataset(train_x, train_y)
+                train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
+
+                for epoch in range(self.n_epochs):
+                    model.train()
+                    train_loss = 0.0
+                    for batch_x, batch_y in train_loader:
+                        optimizer.zero_grad()
+                        outputs = model(batch_x)
+                        loss = criterion(outputs, batch_y)
+                        loss.backward()
+                        optimizer.step()
+                        train_loss += loss.item()
+                    train_loss /= len(train_loader)
+
+                    # Validate; with no held-out windows fall back to the training loss
+                    val_loss = train_loss
+                    if val_size > 0:
+                        model.eval()
+                        with torch.no_grad():
+                            val_loss = criterion(model(val_x), val_y).item()
+                    _log_print(f'Epoch {epoch+1}/{self.n_epochs} - loss: {train_loss:.4f} - val_loss: {val_loss:.4f}')
+                    # Save the best model based on validation loss
+                    if val_loss < best_val_loss:
+                        best_val_loss = val_loss
+                        torch.save(model.state_dict(), filepath)
+
+                # Restore the best weights; skip if no checkpoint was written (NaN loss or zero epochs)
+                if best_val_loss < float('inf'):
+                    model.load_state_dict(torch.load(filepath, map_location=self.device))
 
     def disaggregate_chunk(self, test_main_list, model=None, do_preprocessing=True):
-        """Disaggregate power consumption using overlapping windows and averaging"""
-        
-        if model: self.models = model
-        
-        # Preprocess test data similar to training data
+        """Disaggregate mains chunks into per-appliance power estimates.
+
+        Each window yields a sequence_length-long prediction; overlapping predictions
+        are averaged, denormalised with appliance_params and clipped at zero.
+        Returns a list with one float32 DataFrame (column per appliance) per chunk.
+        """
+        if model is not None:
+            self.models = model
+
         if do_preprocessing:
-            test_main_list = preprocess(
-                sequence_length=self.sequence_length,
-                mains_mean=self.mains_mean,
-                mains_std=self.mains_std,
-                mains_lst=test_main_list,
-                submeters_lst=None,
-                method="test",
-                appliance_params=self.appliance_params,
-                windowing=True
-            )
-
-        results = []
-        n = self.sequence_length
-        
-        # Process each chunk of test data
-        for tm in test_main_list:
-            arr = tm.values.reshape(-1, n)
-            ds  = DataLoader(TensorDataset(torch.tensor(arr, dtype=torch.float32)),
-                             batch_size=self.batch_size)
-            outd = {}
-            
-            # Get predictions from each appliance model
-            for name, m in self.models.items():
-                preds = []
-                m.eval()
+            test_main_list = self.call_preprocessing(
+                test_main_list, submeters_lst=None, method='test')
+
+        test_predictions = []
+        for test_mains_df in test_main_list:
+            disaggregation_dict = {}
+            test_main_array = test_mains_df.values.reshape((-1, self.sequence_length, 1))
+            # Built once per chunk and kept on the CPU; only one batch at a time goes to the device
+            test_tensor = torch.tensor(test_main_array, dtype=torch.float32)
+
+            for appliance, model in self.models.items():
+                model.eval()
                 with torch.no_grad():
-                    for (xb_cpu,) in ds:
-                        # Unsqueeze back to [B, seq_len, 1] for model input
-                        xb = xb_cpu.unsqueeze(-1).to(self.device)
-                        p  = m(xb).cpu().numpy()    # [B, seq_len]
-                        preds.append(p)
+                    # Run the network batch by batch to bound device memory
+                    predictions = []
+                    for i in range(0, len(test_tensor), self.batch_size):
+                        batch = test_tensor[i:i + self.batch_size].to(self.device)
+                        batch_pred = model(batch).cpu().numpy()
+                        predictions.append(batch_pred)
+                    prediction = np.concatenate(predictions, axis=0)
+                # Window i covers output samples [i, i + window_length): overlap-add, then average
+                window_length = self.sequence_length
+                n = len(prediction) + window_length - 1
+                sum_arr = np.zeros(n)
+                counts_arr = np.zeros(n)
                 
-                # Concatenate all predictions
-                P = np.concatenate(preds, axis=0)
+                for i, p in enumerate(prediction):
+                    sum_arr[i:i+window_length] += p.flatten()
+                    counts_arr[i:i+window_length] += 1
                 
-                # Reconstruct full sequence by averaging overlapping windows
-                total = P.shape[0] + n - 1
-                sum_arr    = np.zeros(total)
-                counts_arr = np.zeros(total)
-                for i in range(P.shape[0]):
-                    sum_arr[i:i+n]    += P[i]
-                    counts_arr[i:i+n] += 1
-                avg = sum_arr/counts_arr
+                # Avoid division by zero
+                counts_arr[counts_arr == 0] = 1
+                averaged_prediction = sum_arr / counts_arr
+                # Denormalize the prediction
+                app_mean = self.appliance_params[appliance]['mean']
+                app_std = self.appliance_params[appliance]['std']
+                denormalized_prediction = app_mean + (averaged_prediction * app_std)
                 
-                # Denormalize predictions back to original power scale
-                mpar = self.appliance_params[name]
-                out  = mpar['mean'] + avg * mpar['std']
+                # Set negative values to zero
+                denormalized_prediction[denormalized_prediction < 0] = 0
+                disaggregation_dict[appliance] = pd.Series(denormalized_prediction)
                 
-                # Ensure non-negative power values
-                outd[name] = pd.Series(np.clip(out, 0, None))
+            test_predictions.append(pd.DataFrame(disaggregation_dict, dtype='float32'))
+        return test_predictions
+
+    def call_preprocessing(self, mains_lst, submeters_lst, method):
+        """
+        Window and z-score normalize mains (and, when training, appliance) readings.
+        'train' zero-pads each series by sequence_length // 2 per side and returns
+        (mains_windows, [(name, [windows, ...]), ...]), raising ApplianceNotFoundError for an
+        appliance missing from appliance_params; any other method windows mains without padding.
+        """
+        # Hoisted out of the mains loop: with an empty mains_lst the appliance loop hit a NameError.
+        n = self.sequence_length
+        units_to_pad = n // 2
+        if method == 'train':
+            # Preprocess mains
+            processed_mains_lst = []
+            for mains in mains_lst:
+                new_mains = mains.values.flatten()
+                new_mains = np.pad(new_mains, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+                new_mains = np.array([new_mains[i:i + n] for i in range(len(new_mains) - n + 1)])
+                new_mains = (new_mains - self.mains_mean) / self.mains_std
+                processed_mains_lst.append(pd.DataFrame(new_mains))
+
+            # Preprocess appliances
+            appliance_list = []
+            for app_name, app_df_lst in submeters_lst:
+                if app_name not in self.appliance_params:
+                    raise ApplianceNotFoundError(f"Parameters for appliance '{app_name}' not found!")
                 
-            # Combine all appliance predictions for this chunk
-            results.append(pd.DataFrame(outd, dtype='float32'))
-        return results
\ No newline at end of file
+                app_mean = self.appliance_params[app_name]['mean']
+                app_std = self.appliance_params[app_name]['std']
+
+                processed_app_dfs = []
+                for app_df in app_df_lst:
+                    new_app_readings = app_df.values.flatten()
+                    new_app_readings = np.pad(new_app_readings, (units_to_pad, units_to_pad), 'constant', constant_values=(0, 0))
+                    new_app_readings = np.array([new_app_readings[i:i + n] for i in range(len(new_app_readings) - n + 1)])
+                    new_app_readings = (new_app_readings - app_mean) / app_std
+                    processed_app_dfs.append(pd.DataFrame(new_app_readings))
+
+                appliance_list.append((app_name, processed_app_dfs))
+
+            return processed_mains_lst, appliance_list
+
+        else: # method == 'test'
+            processed_mains_lst = []
+            for mains in mains_lst:
+                new_mains = mains.values.flatten()
+                # The original TF implementation did not pad test data, so we omit it here.
+                new_mains = np.array([new_mains[i:i + n] for i in range(len(new_mains) - n + 1)])
+                new_mains = (new_mains - self.mains_mean) / self.mains_std
+                new_mains = new_mains.reshape((-1, self.sequence_length))
+                processed_mains_lst.append(pd.DataFrame(new_mains))
+            return processed_mains_lst
\ No newline at end of file
diff --git a/nilmtk_contrib/utils/__init__.py b/nilmtk_contrib/utils/__init__.py
new file mode 100644
index 0000000..11e3e8a
--- /dev/null
+++ b/nilmtk_contrib/utils/__init__.py
@@ -0,0 +1,2 @@
+"""Shared utility helpers for nilmtk-contrib."""
+
diff --git a/nilmtk_contrib/utils/checkpoints.py b/nilmtk_contrib/utils/checkpoints.py
new file mode 100644
index 0000000..1ae8d77
--- /dev/null
+++ b/nilmtk_contrib/utils/checkpoints.py
@@ -0,0 +1,181 @@
+"""Checkpoint and persistence helpers."""
+
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import datetime, timezone
+import atexit
+import importlib.metadata
+import inspect
+import json
+from pathlib import Path
+import tempfile
+
+
+METADATA_FILENAME = "metadata.json"
+SCHEMA_VERSION = 1
+_MANAGED_TEMP_DIRS = []
+
+
+@dataclass(frozen=True)
+class ModelMetadata:  # immutable record with the same field names as build_metadata()'s keys
+    schema_version: int  # load_metadata() compares the stored value against SCHEMA_VERSION
+    model_class: str
+    backend: str
+    sequence_length: int
+    appliance_params: dict
+    mains_mean: float
+    mains_std: float
+    created_at: str  # build_metadata() stores an ISO-format UTC timestamp under this key
+    dependencies: dict  # build_metadata() falls back to {} when none are supplied
+
+
+@contextmanager
+def temporary_checkpoint(suffix):
+    """Yield ``checkpoint<suffix>`` inside a scratch directory that is deleted on context exit."""
+    with tempfile.TemporaryDirectory() as scratch_dir:
+        yield Path(scratch_dir, f"checkpoint{suffix}")
+
+
+def managed_checkpoint_path(suffix):
+    """Return ``checkpoint<suffix>`` in a new temp directory that outlives this call."""
+    holder = tempfile.TemporaryDirectory()
+    _MANAGED_TEMP_DIRS.append(holder)  # keep a reference so the atexit hook can clean it up
+    return Path(holder.name).joinpath(f"checkpoint{suffix}")
+
+
+def _cleanup_managed_temp_dirs():
+    for holder in _MANAGED_TEMP_DIRS:  # every directory handed out by managed_checkpoint_path
+        holder.cleanup()
+
+
+atexit.register(_cleanup_managed_temp_dirs)
+
+
+def collect_dependencies(packages):
+    """Map each package name to its installed version, or None when it is not installed."""
+    versions = {}
+    for name in packages:
+        versions[name] = None
+        try:
+            versions[name] = importlib.metadata.version(name)
+        except importlib.metadata.PackageNotFoundError:
+            continue
+    return versions
+
+
+def _json_safe(value):
+    """Recursively turn containers, array scalars and arrays into JSON-serializable values."""
+    if isinstance(value, dict):
+        return {key: _json_safe(item) for key, item in value.items()}
+    if isinstance(value, (list, tuple)):
+        return [_json_safe(item) for item in value]
+    if hasattr(value, "item"):
+        try:
+            return value.item()
+        except (ValueError, RuntimeError):
+            # Multi-element arrays have no scalar form; emit a nested list rather than the raw array.
+            return _json_safe(value.tolist()) if hasattr(value, "tolist") else value
+    return value
+
+
+def build_metadata(
+    *,
+    model_class,
+    backend,
+    sequence_length,
+    appliance_params,
+    mains_mean,
+    mains_std,
+    dependencies=None,
+):
+    """Return a JSON-ready metadata dict, stamped with SCHEMA_VERSION and a UTC ``created_at``."""
+    return {
+        "schema_version": SCHEMA_VERSION,
+        "model_class": model_class,
+        "backend": backend,
+        "sequence_length": _json_safe(sequence_length),  # array-scalar ints would break json.dump
+        "appliance_params": _json_safe(appliance_params),
+        "mains_mean": _json_safe(mains_mean),
+        "mains_std": _json_safe(mains_std),
+        "created_at": datetime.now(timezone.utc).isoformat(),
+        "dependencies": dependencies or {},
+    }
+
+
+def save_metadata(path, metadata):
+    """Dump ``metadata`` as sorted, indented JSON into ``path``, creating the folder if needed."""
+    target_dir = Path(path)
+    target_dir.mkdir(parents=True, exist_ok=True)
+    with open(target_dir / METADATA_FILENAME, "w", encoding="utf-8") as stream:
+        json.dump(metadata, stream, indent=2, sort_keys=True)
+
+
+def load_metadata(path, *, expected_model_class=None, expected_backend=None):
+    """Read the metadata JSON stored in ``path`` and validate it; raises ValueError on mismatch."""
+    with open(Path(path) / METADATA_FILENAME, encoding="utf-8") as stream:
+        record = json.load(stream)
+
+    # Every key listed here must be present in the stored document.
+    expected_keys = (
+        "schema_version",
+        "model_class",
+        "backend",
+        "sequence_length",
+        "appliance_params",
+        "mains_mean",
+        "mains_std",
+        "created_at",
+        "dependencies",
+    )
+    absent = sorted(key for key in expected_keys if key not in record)
+    if absent:
+        raise ValueError(f"Missing metadata fields: {', '.join(absent)}.")
+
+    found_version = record["schema_version"]
+    if found_version != SCHEMA_VERSION:
+        raise ValueError(f"Unsupported metadata schema_version {found_version}.")
+
+    found_class = record["model_class"]
+    if expected_model_class and found_class != expected_model_class:
+        raise ValueError(
+            f"Expected model_class {expected_model_class!r}, got {found_class!r}."
+        )
+
+    found_backend = record["backend"]
+    if expected_backend and found_backend != expected_backend:
+        raise ValueError(f"Expected backend {expected_backend!r}, got {found_backend!r}.")
+    return record
+
+
+def save_torch_state(model, path):
+    """Write ``model.state_dict()`` to ``path`` via ``torch.save``."""
+    import torch  # function-scope import: torch is only needed when this is called
+    weights = model.state_dict()
+    torch.save(weights, path)
+
+
+def load_torch_state(model, path, device, weights_only=True):
+    """Load the state dict stored at ``path`` (mapped to ``device``) and apply it to ``model``."""
+    import torch
+
+    # Forward weights_only only when this torch.load signature declares that parameter.
+    accepted = inspect.signature(torch.load).parameters
+    extra = {"weights_only": weights_only} if "weights_only" in accepted else {}
+    state = torch.load(path, map_location=device, **extra)
+    model.load_state_dict(state)
+    return model
+
+
+def save_keras_weights(model, path):
+    """Delegate to ``model.save_weights(path)``; nothing is returned."""
+    model.save_weights(path)
+
+
+def load_keras_weights(model, path):
+    """Call ``model.load_weights(path)`` and return the same ``model`` object."""
+    model.load_weights(path)
+    return model
+
+
+def unsupported_persistence(model_name):
+    """Always raise NotImplementedError, naming ``model_name`` in the message."""
+    raise NotImplementedError(f"{model_name} does not implement model persistence.")
diff --git a/nilmtk_contrib/utils/logging.py b/nilmtk_contrib/utils/logging.py
new file mode 100644
index 0000000..5060eee
--- /dev/null
+++ b/nilmtk_contrib/utils/logging.py
@@ -0,0 +1,24 @@
+"""Logging helpers."""
+
+import logging
+
+
+def get_logger(name):
+    """Return ``logging.getLogger(name)``; no handlers or levels are touched here."""
+    return logging.getLogger(name)
+
+
+def log_print(logger, *args, **kwargs):
+    """Log ``print``-style arguments at INFO level; calls that pass ``file=`` are skipped."""
+    if kwargs.get("file") is not None:
+        return  # output redirected to an explicit stream is not logged (unchanged behaviour)
+    sep = kwargs.get("sep")
+    # print() accepts sep=None as "use the default separator"; mirror that instead of crashing.
+    logger.info((" " if sep is None else sep).join(str(arg) for arg in args))
+
+
+def configure_logging(verbose=False):
+    """Opt-in setup: set root logging to INFO when ``verbose`` is truthy, otherwise WARNING."""
+    threshold = logging.WARNING if not verbose else logging.INFO
+    logging.basicConfig(level=threshold)
+    logging.getLogger().setLevel(threshold)  # explicit: basicConfig does nothing once handlers exist
diff --git a/nilmtk_contrib/utils/model.py b/nilmtk_contrib/utils/model.py
new file mode 100644
index 0000000..e5f18eb
--- /dev/null
+++ b/nilmtk_contrib/utils/model.py
@@ -0,0 +1,48 @@
+"""Shared model-level migration helpers."""
+
+from types import MethodType
+
+from nilmtk_contrib.utils.checkpoints import managed_checkpoint_path, unsupported_persistence
+from nilmtk_contrib.utils.logging import configure_logging, get_logger, log_print
+from nilmtk_contrib.utils.random import set_random_seed
+
+
+def _unsupported_save_model(self, *args, **kwargs):
+    # Prefer an explicit MODEL_NAME attribute; otherwise report the class name.
+    unsupported_persistence(getattr(self, "MODEL_NAME", self.__class__.__name__))
+
+
+def _unsupported_load_model(self, *args, **kwargs):
+    # Prefer an explicit MODEL_NAME attribute; otherwise report the class name.
+    unsupported_persistence(getattr(self, "MODEL_NAME", self.__class__.__name__))
+
+
+def initialize_runtime(model, params, *, backends):
+    """Attach seed/verbose settings and persistence fallbacks to ``model``; ``params`` may be None."""
+    params = params or {}  # tolerate params=None, like get_param / normalize_common_params do
+    model.seed = params.get("seed", getattr(model, "seed", None))
+    model.verbose = params.get("verbose", getattr(model, "verbose", False))
+    configure_logging(model.verbose)
+    set_random_seed(model.seed, backends=backends)
+    for name, fallback in (("save_model", _unsupported_save_model), ("load_model", _unsupported_load_model)):
+        if not callable(getattr(model, name, None)):
+            setattr(model, name, MethodType(fallback, model))
+
+
+def module_logger(name):
+    """Thin alias for ``get_logger(name)``, intended for model modules."""
+    return get_logger(name)
+
+
+def legacy_print(logger):
+    """Build a ``print``-compatible callable that forwards its arguments to ``log_print``."""
+
+    def _emit(*values, **options):
+        log_print(logger, *values, **options)
+
+    return _emit
+
+
+def checkpoint_path(suffix):
+    """Thin alias for ``managed_checkpoint_path(suffix)`` (process-lifetime temporary path)."""
+    return managed_checkpoint_path(suffix)
diff --git a/nilmtk_contrib/utils/optional_imports.py b/nilmtk_contrib/utils/optional_imports.py
new file mode 100644
index 0000000..85de52e
--- /dev/null
+++ b/nilmtk_contrib/utils/optional_imports.py
@@ -0,0 +1,21 @@
+"""Helpers for optional backend dependencies."""
+
+from importlib import import_module
+
+
+class OptionalDependencyError(ImportError):
+    """Raised when an optional backend dependency is required but missing."""
+
+
+def require_optional(package_name, extra_name, purpose):
+    """Import ``package_name`` or raise OptionalDependencyError naming the extra to install."""
+    try:
+        return import_module(package_name)
+    except ModuleNotFoundError as exc:
+        # Translate only when the missing module is the requested package or one of its parents
+        # ("tensorflow" for "tensorflow.keras"); a missing unrelated module propagates unchanged.
+        if exc.name is None or not f"{package_name}.".startswith(f"{exc.name}."):
+            raise
+        message = (f"{purpose} requires '{package_name}'. "
+                   f"Install nilmtk-contrib[{extra_name}].")
+        raise OptionalDependencyError(message) from exc
diff --git a/nilmtk_contrib/utils/params.py b/nilmtk_contrib/utils/params.py
new file mode 100644
index 0000000..ca06060
--- /dev/null
+++ b/nilmtk_contrib/utils/params.py
@@ -0,0 +1,157 @@
+"""Shared parameter parsing and validation helpers."""
+
+from dataclasses import dataclass
+import warnings
+
+
+@dataclass(frozen=True)
+class CommonParams:  # immutable result object built by normalize_common_params()
+    sequence_length: int  # validated there as a positive int
+    n_epochs: int  # validated there as a non-negative int (0 is allowed)
+    batch_size: int  # validated there as a positive int
+    mains_mean: float
+    mains_std: float  # validated there as non-zero
+    appliance_params: dict  # dict entries carrying a "std" key must have a non-zero std
+    save_model_path: str | None  # also read from the deprecated "save-model-path" key
+    pretrained_model_path: str | None  # also read from the deprecated keys in DEFAULT_ALIASES
+    chunk_wise_training: bool
+    seed: int | None
+    verbose: bool
+    device: str | None
+
+
+DEFAULT_ALIASES = {
+    "save_model_path": ("save-model-path",),
+    "pretrained_model_path": (
+        "pretrained-model-path",
+        "load_model_path",
+        "load-model-path",
+    ),
+}
+
+
+def get_param(params, canonical, default=None, aliases=(), required=False):
+    """Look up ``canonical`` in ``params``, falling back to deprecated ``aliases`` with a warning."""
+    source = {} if params is None else params
+
+    if canonical in source:
+        return source[canonical]
+
+    # Aliases are honoured in the order given; the first one present wins.
+    present = [alias for alias in aliases if alias in source]
+    if present:
+        warnings.warn(
+            f"Parameter '{present[0]}' is deprecated; use '{canonical}' instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return source[present[0]]
+
+    if required:
+        raise ValueError(f"Missing required parameter '{canonical}'.")
+
+    return default
+
+
+def require_odd_sequence_length(sequence_length):
+    """Raise ValueError when ``sequence_length`` is evenly divisible by two."""
+    if not sequence_length % 2:
+        raise ValueError("sequence_length must be odd.")
+
+
+def validate_positive_int(name, value):
+    """Return ``value`` if it is an int (bool excluded) greater than zero, else raise ValueError."""
+    if not (isinstance(value, int) and not isinstance(value, bool) and value > 0):
+        raise ValueError(f"{name} must be a positive integer.")
+    return value
+
+
+def validate_non_negative_int(name, value):
+    """Return ``value`` if it is an int (bool excluded) of at least zero, else raise ValueError."""
+    if not (isinstance(value, int) and not isinstance(value, bool) and value >= 0):
+        raise ValueError(f"{name} must be a non-negative integer.")
+    return value
+
+
+def validate_positive_number(name, value):
+    """Return ``value`` if it is a number > 0; bools, None, strings and NaN raise ValueError."""
+    if isinstance(value, (bool, str)) or value is None or not value > 0:  # "not >" also rejects NaN
+        raise ValueError(f"{name} must be a positive number.")
+    return value
+
+
+def _validate_non_zero_std(name, value):  # returns ``value`` unchanged unless it equals 0
+    if value == 0:  # NOTE(review): None and NaN pass this check — confirm that is intended
+        raise ValueError(f"{name} must not be zero.")
+    return value
+
+
+def _validate_appliance_params(appliance_params):
+    # Check every dict-valued entry that carries a "std" key; other entries are skipped.
+    # The mapping itself is returned unchanged.
+    for label, entry in appliance_params.items():
+        if isinstance(entry, dict) and "std" in entry:
+            _validate_non_zero_std(f"appliance_params[{label!r}]['std']", entry["std"])
+    return appliance_params
+
+
+def normalize_common_params(params, defaults):
+    """Resolve shared settings from ``params`` over ``defaults``, validate them, return CommonParams."""
+    params = params or {}  # None (or any falsy value) is treated as "no overrides"
+    defaults = defaults or {}
+
+    sequence_length = get_param(
+        params,
+        "sequence_length",
+        default=defaults.get("sequence_length"),
+    )
+    n_epochs = get_param(params, "n_epochs", default=defaults.get("n_epochs"))
+    batch_size = get_param(params, "batch_size", default=defaults.get("batch_size"))
+    mains_mean = get_param(params, "mains_mean", default=defaults.get("mains_mean"))
+    mains_std = get_param(params, "mains_std", default=defaults.get("mains_std"))
+    appliance_params = get_param(
+        params,
+        "appliance_params",
+        default=defaults.get("appliance_params", {}),
+    )
+    save_model_path = get_param(
+        params,
+        "save_model_path",
+        default=defaults.get("save_model_path"),
+        aliases=DEFAULT_ALIASES["save_model_path"],  # deprecated spellings still accepted
+    )
+    pretrained_model_path = get_param(
+        params,
+        "pretrained_model_path",
+        default=defaults.get("pretrained_model_path"),
+        aliases=DEFAULT_ALIASES["pretrained_model_path"],  # deprecated spellings still accepted
+    )
+    chunk_wise_training = get_param(
+        params,
+        "chunk_wise_training",
+        default=defaults.get("chunk_wise_training", False),
+    )
+    seed = get_param(params, "seed", default=defaults.get("seed"))
+    verbose = get_param(params, "verbose", default=defaults.get("verbose", False))
+    device = get_param(params, "device", default=defaults.get("device"))
+
+    # Each validator raises ValueError on failure; mains_mean and the remaining fields are not checked.
+    validate_positive_int("sequence_length", sequence_length)
+    validate_non_negative_int("n_epochs", n_epochs)
+    validate_positive_int("batch_size", batch_size)
+    _validate_non_zero_std("mains_std", mains_std)
+    _validate_appliance_params(appliance_params)
+
+    return CommonParams(
+        sequence_length=sequence_length,
+        n_epochs=n_epochs,
+        batch_size=batch_size,
+        mains_mean=mains_mean,
+        mains_std=mains_std,
+        appliance_params=appliance_params,
+        save_model_path=save_model_path,
+        pretrained_model_path=pretrained_model_path,
+        chunk_wise_training=chunk_wise_training,
+        seed=seed,
+        verbose=verbose,
+        device=device,
+    )
diff --git a/nilmtk_contrib/utils/random.py b/nilmtk_contrib/utils/random.py
new file mode 100644
index 0000000..3491fae
--- /dev/null
+++ b/nilmtk_contrib/utils/random.py
@@ -0,0 +1,42 @@
+"""Random seed helpers."""
+
+import random
+
+
+def set_random_seed(seed, backends=("python", "numpy", "torch", "tensorflow")):
+    """Seed the requested RNG backends; a backend that is not installed is skipped.
+
+    Deterministic execution modes are deliberately left untouched because they
+    trade away performance and operator availability. ``seed=None`` is a no-op.
+    """
+    if seed is None:
+        return
+
+    if "python" in backends:
+        random.seed(seed)
+
+    if "numpy" in backends:
+        try:
+            import numpy
+        except ModuleNotFoundError:
+            numpy = None
+        if numpy is not None:
+            numpy.random.seed(seed)
+
+    if "torch" in backends:
+        try:
+            import torch
+        except ModuleNotFoundError:
+            torch = None
+        if torch is not None:
+            torch.manual_seed(seed)
+            if torch.cuda.is_available():
+                torch.cuda.manual_seed_all(seed)
+
+    if "tensorflow" in backends:
+        try:
+            import tensorflow
+        except ModuleNotFoundError:
+            tensorflow = None
+        if tensorflow is not None:
+            tensorflow.random.set_seed(seed)
diff --git a/nilmtk_contrib/utils/validation.py b/nilmtk_contrib/utils/validation.py
new file mode 100644
index 0000000..cb2fe8b
--- /dev/null
+++ b/nilmtk_contrib/utils/validation.py
@@ -0,0 +1,216 @@
+"""Safe train/validation splitting helpers."""
+
+from dataclasses import dataclass
+
+import numpy as np
+
+
+@dataclass(frozen=True)
+class TrainingDecision:  # immutable verdict returned by should_train()
+    should_train: bool  # False when num_samples < min_samples
+    reason: str  # human-readable explanation of the verdict
+    num_samples: int
+    min_samples: int
+
+
+@dataclass(frozen=True)
+class SplitMetadata:  # immutable description of one train_validation_split() outcome
+    should_train: bool
+    reason: str
+    num_samples: int
+    train_size: int  # 0 whenever should_train is False
+    validation_size: int
+    validation_enabled: bool  # False when no validation subset was carved out
+    validation_fraction: float  # the requested fraction, echoed back unchanged
+    strategy: str  # "tail" or "random"
+    seed: int | None
+
+
+@dataclass(frozen=True)
+class TrainValidationSplit:  # the four subsets plus the metadata explaining how they were made
+    X_train: object  # train_validation_split() stores None here when training is declined
+    y_train: object
+    X_val: object | None
+    y_val: object | None
+    metadata: SplitMetadata
+
+
+def should_train(num_samples, min_samples):
+    """Compare ``num_samples`` with ``min_samples`` and report the verdict with a reason."""
+    # Training is allowed as soon as the sample count is not below the minimum.
+    enough = not num_samples < min_samples
+
+    if enough:
+        reason = "enough samples to train."
+    else:
+        reason = f"num_samples={num_samples} is below min_samples={min_samples}."
+
+    return TrainingDecision(
+        should_train=enough,
+        reason=reason,
+        num_samples=num_samples,
+        min_samples=min_samples,
+    )
+
+
+def _length(values):  # len() wrapper that reports unsized inputs as ValueError
+    try:
+        return len(values)
+    except TypeError as exc:  # the original TypeError stays attached as the cause
+        raise ValueError("X and y must be sized collections.") from exc
+
+
+def _take(values, indices):
+    # Positional selection: .iloc when available, rebuild for list/tuple, else values[indices].
+    if values is None:
+        return None
+    if hasattr(values, "iloc"):
+        return values.iloc[indices]
+    rebuild = isinstance(values, (list, tuple))
+    return type(values)(values[int(i)] for i in indices) if rebuild else values[indices]
+
+
+def _empty_like(values):
+    # Select zero positions from ``values`` via _take; None is passed through as None.
+    no_rows = np.asarray([], dtype=int)
+    return None if values is None else _take(values, no_rows)
+
+
+def train_validation_split(
+    X,
+    y,
+    validation_fraction=0.15,
+    strategy="tail",
+    seed=None,
+    min_train=1,
+    min_val=1,
+    allow_no_validation=False,
+):
+    """Split X/y into train and validation parts; returns a TrainValidationSplit with metadata."""
+    if strategy not in {"tail", "random"}:
+        raise ValueError("strategy must be one of 'tail' or 'random'.")
+    if not 0 < validation_fraction < 1:
+        raise ValueError("validation_fraction must be between 0 and 1.")
+    if min_train < 1:
+        raise ValueError("min_train must be at least 1.")
+    if min_val < 1:
+        raise ValueError("min_val must be at least 1.")
+
+    num_samples = _length(X)
+    if _length(y) != num_samples:
+        raise ValueError("X and y must contain the same number of samples.")
+
+    min_samples_with_validation = min_train + min_val
+    if num_samples < min_samples_with_validation:  # too few rows for both subsets
+        if not allow_no_validation:
+            metadata = SplitMetadata(
+                should_train=False,
+                reason=(
+                    f"num_samples={num_samples} is below the required "
+                    f"min_train + min_val={min_samples_with_validation}."
+                ),
+                num_samples=num_samples,
+                train_size=0,
+                validation_size=0,
+                validation_enabled=False,
+                validation_fraction=validation_fraction,
+                strategy=strategy,
+                seed=seed,
+            )
+            return TrainValidationSplit(None, None, None, None, metadata)
+
+        decision = should_train(num_samples, min_train)  # can we at least train without validation?
+        metadata = SplitMetadata(
+            should_train=decision.should_train,
+            reason=(
+                "training without validation because there are not enough "
+                "samples for a validation split."
+                if decision.should_train
+                else decision.reason
+            ),
+            num_samples=num_samples,
+            train_size=num_samples if decision.should_train else 0,
+            validation_size=0,
+            validation_enabled=False,
+            validation_fraction=validation_fraction,
+            strategy=strategy,
+            seed=seed,
+        )
+        if not decision.should_train:
+            return TrainValidationSplit(None, None, None, None, metadata)
+        indices = np.arange(num_samples)  # every row goes to the training side
+        return TrainValidationSplit(
+            _take(X, indices),
+            _take(y, indices),
+            _empty_like(X),
+            _empty_like(y),
+            metadata,
+        )
+
+    validation_size = max(min_val, int(round(num_samples * validation_fraction)))
+    validation_size = min(validation_size, num_samples - min_train)  # keep min_train rows for training
+    train_size = num_samples - validation_size
+
+    if strategy == "tail":  # validation is the last validation_size rows, order preserved
+        train_indices = np.arange(train_size)
+        validation_indices = np.arange(train_size, num_samples)
+    else:
+        rng = np.random.default_rng(seed)
+        indices = rng.permutation(num_samples)
+        validation_indices = np.sort(indices[:validation_size])  # sorted: original order within each subset
+        train_indices = np.sort(indices[validation_size:])
+
+    metadata = SplitMetadata(
+        should_train=True,
+        reason="using train/validation split.",
+        num_samples=num_samples,
+        train_size=len(train_indices),
+        validation_size=len(validation_indices),
+        validation_enabled=True,
+        validation_fraction=validation_fraction,
+        strategy=strategy,
+        seed=seed,
+    )
+    return TrainValidationSplit(
+        _take(X, train_indices),
+        _take(y, train_indices),
+        _take(X, validation_indices),
+        _take(y, validation_indices),
+        metadata,
+    )
+
+
+def safe_train_test_split(*arrays, test_size=0.15, random_state=None, shuffle=True, **_):
+    """Split every array into (train, held-out) pieces, sklearn-style; extra keywords are ignored."""
+    if not arrays:
+        raise ValueError("At least one array is required.")
+    num_samples = _length(arrays[0])
+    if any(_length(array) != num_samples for array in arrays[1:]):
+        raise ValueError("All arrays must contain the same number of samples.")
+
+    splittable = num_samples >= 2
+    if not splittable:
+        # Fewer than two rows: nothing can be held out, so all rows stay on the training side.
+        validation_size = 0
+    elif isinstance(test_size, float):
+        # Fractional request: hold out at least one row.
+        validation_size = max(1, int(round(num_samples * test_size)))
+    else:
+        # Anything else is treated as an absolute row count.
+        validation_size = int(test_size)
+    if splittable:
+        validation_size = min(validation_size, num_samples - 1)
+    train_size = num_samples - validation_size
+
+    if splittable and shuffle:
+        order = np.random.default_rng(random_state).permutation(num_samples)
+        validation_indices = np.sort(order[:validation_size])
+        train_indices = np.sort(order[validation_size:])
+    else:
+        train_indices = np.arange(train_size)
+        validation_indices = np.arange(train_size, num_samples)
+
+    pieces = []
+    for array in arrays:
+        pieces += [_take(array, train_indices), _take(array, validation_indices)]
+    return tuple(pieces)
diff --git a/pyproject.toml b/pyproject.toml
index db21e8b..6b1bce3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,10 +8,10 @@ allow-direct-references = true
 [project]
 name = "nilmtk-contrib"
 version = "0.1.2"
-description = "State-of-the-art algorithms for energy disaggregation using NILMTK’s Rapid Experimentation API"
+description = "NILMTK-compatible algorithms for energy disaggregation using NILMTK's Rapid Experimentation API"
 readme = "README.md"
 license = { text = "Apache-2.0" }
-requires-python = "==3.11.5"
+requires-python = ">=3.11,<3.12"
 authors = [
   { name = "NILMTK-contrib developers" }
 ]
@@ -26,29 +26,74 @@ classifiers = [
   "Programming Language :: Python :: 3.11",
   "Topic :: Scientific/Engineering :: Mathematics"
 ]
-dependencies = [
-  "tensorflow-io-gcs-filesystem==0.31.0",
-  "nilmtk @ git+https://github.com/nilmtk/nilmtk.git",
-  "tensorflow>=2.12.0,<2.16.0",
-  "cvxpy>=1.0.0", 
-  "torch>=2.0,<2.7",
-  "tqdm>=4.66"
-]
-
-[project.optional-dependencies]
-dev = [
-  "pytest>=7.4.0",
-  "pytest-cov>=4.1.0",
-  "black>=23.0.0",
-  "ruff>=0.0.280"
-]
+dependencies = []
+
+[project.optional-dependencies]
+tensorflow = [
+  "nilmtk @ git+https://github.com/nilmtk/nilmtk.git",
+  "numpy",
+  "pandas",
+  "scikit-learn",
+  "matplotlib",
+  "tensorflow>=2.12.0,<2.16.0",
+  "tensorflow-io-gcs-filesystem==0.31.0"
+]
+torch = [
+  "nilmtk @ git+https://github.com/nilmtk/nilmtk.git",
+  "numpy",
+  "pandas",
+  "scikit-learn",
+  "matplotlib",
+  "torch>=2.0,<2.7",
+  "tqdm>=4.66"
+]
+classical = [
+  "nilmtk @ git+https://github.com/nilmtk/nilmtk.git",
+  "numpy",
+  "pandas",
+  "matplotlib",
+  "scikit-learn",
+  "scipy",
+  "cvxpy>=1.0.0",
+  "hmmlearn"
+]
+nilm = [
+  "nilmtk @ git+https://github.com/nilmtk/nilmtk.git"
+]
+all = [
+  "nilmtk @ git+https://github.com/nilmtk/nilmtk.git",
+  "numpy",
+  "pandas",
+  "scikit-learn",
+  "scipy",
+  "matplotlib",
+  "tensorflow>=2.12.0,<2.16.0",
+  "tensorflow-io-gcs-filesystem==0.31.0",
+  "torch>=2.0,<2.7",
+  "tqdm>=4.66",
+  "cvxpy>=1.0.0",
+  "hmmlearn"
+]
+dev = [
+  "numpy",
+  "pandas",
+  "pytest>=7.4.0",
+  "pytest-cov>=4.1.0",
+  "black>=23.0.0",
+  "ruff>=0.0.280",
+  "build>=1.0.0"
+]
 
 [tool.hatch.version]
 path = "nilmtk_contrib/version.py"
 
-[tool.uv]
-dev-dependencies = [
-  "pytest>=7.4.0",
-  "black>=23.0.0",
-  "ruff>=0.0.280"
-]
+[dependency-groups]
+dev = [
+  "numpy",
+  "pandas",
+  "pytest>=7.4.0",
+  "pytest-cov>=4.1.0",
+  "black>=23.0.0",
+  "ruff>=0.0.280",
+  "build>=1.0.0"
+]
diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py
new file mode 100644
index 0000000..beb0e8f
--- /dev/null
+++ b/tests/test_checkpoints.py
@@ -0,0 +1,114 @@
+import json
+
+import pytest
+
+from nilmtk_contrib.utils.checkpoints import (
+    SCHEMA_VERSION,
+    build_metadata,
+    collect_dependencies,
+    load_metadata,
+    managed_checkpoint_path,
+    save_metadata,
+    temporary_checkpoint,
+    unsupported_persistence,
+)
+
+
+def test_temporary_checkpoint_removes_parent_directory_after_exit():
+    with temporary_checkpoint(".pt") as path:
+        parent = path.parent
+        path.write_text("checkpoint", encoding="utf-8")
+        assert path.exists()
+
+    assert not parent.exists()
+
+
+def test_managed_checkpoint_path_uses_existing_temp_parent():
+    path = managed_checkpoint_path(".pt")
+
+    assert path.name == "checkpoint.pt"
+    assert path.parent.exists()
+
+
+def test_build_save_and_load_metadata(tmp_path):
+    metadata = build_metadata(
+        model_class="DAE",
+        backend="torch",
+        sequence_length=99,
+        appliance_params={"fridge": {"mean": 10, "std": 2}},
+        mains_mean=1000,
+        mains_std=600,
+        dependencies={"torch": "2.0.0"},
+    )
+
+    save_metadata(tmp_path, metadata)
+    loaded = load_metadata(
+        tmp_path,
+        expected_model_class="DAE",
+        expected_backend="torch",
+    )
+
+    assert loaded["schema_version"] == SCHEMA_VERSION
+    assert loaded["model_class"] == "DAE"
+    assert loaded["backend"] == "torch"
+    assert loaded["sequence_length"] == 99
+    assert loaded["appliance_params"] == {"fridge": {"mean": 10, "std": 2}}
+    assert loaded["mains_mean"] == 1000
+    assert loaded["mains_std"] == 600
+    assert loaded["dependencies"] == {"torch": "2.0.0"}
+    assert "created_at" in loaded
+
+
+def test_load_metadata_rejects_missing_fields(tmp_path):
+    (tmp_path / "metadata.json").write_text(
+        json.dumps({"schema_version": SCHEMA_VERSION}),
+        encoding="utf-8",
+    )
+
+    with pytest.raises(ValueError, match="Missing metadata fields"):
+        load_metadata(tmp_path)
+
+
+def test_load_metadata_rejects_schema_mismatch(tmp_path):
+    metadata = build_metadata(
+        model_class="DAE",
+        backend="torch",
+        sequence_length=99,
+        appliance_params={},
+        mains_mean=1000,
+        mains_std=600,
+    )
+    metadata["schema_version"] = 999
+    save_metadata(tmp_path, metadata)
+
+    with pytest.raises(ValueError, match="Unsupported metadata schema_version"):
+        load_metadata(tmp_path)
+
+
+def test_load_metadata_rejects_wrong_model_or_backend(tmp_path):
+    metadata = build_metadata(
+        model_class="DAE",
+        backend="torch",
+        sequence_length=99,
+        appliance_params={},
+        mains_mean=1000,
+        mains_std=600,
+    )
+    save_metadata(tmp_path, metadata)
+
+    with pytest.raises(ValueError, match="Expected model_class"):
+        load_metadata(tmp_path, expected_model_class="Seq2Point")
+
+    with pytest.raises(ValueError, match="Expected backend"):
+        load_metadata(tmp_path, expected_backend="tensorflow")
+
+
+def test_collect_dependencies_marks_missing_package_as_none():
+    dependencies = collect_dependencies(["definitely-missing-nilmtk-contrib-package"])
+
+    assert dependencies == {"definitely-missing-nilmtk-contrib-package": None}
+
+
+def test_unsupported_persistence_raises_with_model_name():
+    with pytest.raises(NotImplementedError, match="AFHMM"):
+        unsupported_persistence("AFHMM")
diff --git a/tests/test_imports.py b/tests/test_imports.py
new file mode 100644
index 0000000..416aa3c
--- /dev/null
+++ b/tests/test_imports.py
@@ -0,0 +1,79 @@
+import importlib
+import json
+import subprocess
+import sys
+
+import pytest
+
+from nilmtk_contrib.utils.optional_imports import OptionalDependencyError, require_optional
+
+
+BACKEND_MODULES = {"tensorflow", "torch", "cvxpy", "hmmlearn", "nilmtk", "pandas"}
+
+
+def _imported_modules_after(statement):
+    """Run ``statement`` in a fresh interpreter; return loaded BACKEND_MODULES."""
+    code = (
+        f"import json, sys\n{statement}\n"
+        f"print(json.dumps(sorted({BACKEND_MODULES!r}.intersection(sys.modules))))"
+    )
+    output = subprocess.check_output([sys.executable, "-c", code], text=True)
+    return set(json.loads(output.splitlines()[-1]))  # last line is the JSON report
+
+
+def test_top_level_import_is_lightweight():
+    imported = _imported_modules_after("import nilmtk_contrib")
+    assert imported == set()
+
+
+def test_disaggregate_package_import_is_lightweight():
+    imported = _imported_modules_after("import nilmtk_contrib.disaggregate")
+    assert imported == set()
+
+
+def test_torch_package_import_is_lightweight():
+    imported = _imported_modules_after("import nilmtk_contrib.torch")
+    assert imported == set()
+
+
+def test_mains_stats_import_does_not_import_nilmtk():
+    imported = _imported_modules_after("import nilmtk_contrib.mains_stats")
+    assert imported == set()
+
+
+def test_require_optional_error_message():
+    with pytest.raises(OptionalDependencyError) as exc_info:
+        require_optional(
+            "definitely_missing_nilmtk_contrib_dependency",
+            "dev",
+            "Import test",
+        )
+
+    assert str(exc_info.value) == (
+        "Import test requires 'definitely_missing_nilmtk_contrib_dependency'. "
+        "Install nilmtk-contrib[dev]."
+    )
+
+
+@pytest.mark.parametrize(
+    ("package_name", "class_name"),
+    [
+        ("nilmtk_contrib.disaggregate", "DAE"),
+        ("nilmtk_contrib.disaggregate", "AFHMM"),
+        ("nilmtk_contrib.torch", "DAE"),
+    ],
+)
+def test_backend_exports_succeed_or_raise_optional_dependency_message(
+    package_name,
+    class_name,
+):
+    package = importlib.import_module(package_name)
+
+    try:
+        getattr(package, class_name)
+    except OptionalDependencyError as exc:
+        message = str(exc)
+        assert f"{class_name} requires '" in message
+        assert "Install nilmtk-contrib[" in message
+    except ImportError as exc:
+        pytest.fail(f"Unexpected non-optional import failure: {exc}")
diff --git a/tests/test_model_runtime.py b/tests/test_model_runtime.py
new file mode 100644
index 0000000..98f45f4
--- /dev/null
+++ b/tests/test_model_runtime.py
@@ -0,0 +1,42 @@
+import pytest
+
+from nilmtk_contrib.utils.model import initialize_runtime
+
+
+class RuntimeOnlyModel:
+    """Stub model that defines no persistence methods of its own."""
+
+
+class PersistentModel:
+    """Stub model whose save/load methods return sentinel strings."""
+
+    def save_model(self):
+        return "saved"
+
+    def load_model(self):
+        return "loaded"
+
+
+def test_initialize_runtime_adds_clear_persistence_fallbacks():
+    """Models without save/load get fallbacks that raise and name the model."""
+    runtime_params = {"seed": 123, "verbose": False}
+    stub = RuntimeOnlyModel()
+    stub.MODEL_NAME = "RuntimeOnly"
+
+    initialize_runtime(stub, runtime_params, backends=("python",))
+
+    assert stub.seed == 123
+    assert stub.verbose is False
+    for method_name in ("save_model", "load_model"):
+        with pytest.raises(NotImplementedError, match="RuntimeOnly"):
+            getattr(stub, method_name)()
+
+
+def test_initialize_runtime_preserves_real_persistence_methods():
+    """Existing save/load implementations must survive initialization."""
+    stub = PersistentModel()
+
+    initialize_runtime(stub, {}, backends=("python",))
+
+    assert stub.save_model() == "saved"
+    assert stub.load_model() == "loaded"
diff --git a/tests/test_params.py b/tests/test_params.py
new file mode 100644
index 0000000..9045b4b
--- /dev/null
+++ b/tests/test_params.py
@@ -0,0 +1,170 @@
+import pytest
+
+from nilmtk_contrib.utils.params import (
+    get_param,
+    normalize_common_params,
+    require_odd_sequence_length,
+    validate_non_negative_int,
+    validate_positive_int,
+    validate_positive_number,
+)
+
+
+DEFAULTS = {
+    "sequence_length": 99,
+    "n_epochs": 10,
+    "batch_size": 512,
+    "mains_mean": 1000,
+    "mains_std": 600,
+    "appliance_params": {},
+    "save_model_path": None,
+    "pretrained_model_path": None,
+    "chunk_wise_training": False,
+    "seed": None,
+    "verbose": False,
+    "device": None,
+}
+
+
+def test_get_param_prefers_canonical_name_over_alias():
+    value = get_param(
+        {"sequence_length": 101, "seq_len": 99},
+        "sequence_length",
+        aliases=("seq_len",),
+    )
+
+    assert value == 101
+
+
+def test_get_param_alias_warns():
+    with pytest.warns(DeprecationWarning, match="save-model-path"):
+        value = get_param(
+            {"save-model-path": "old-path"},
+            "save_model_path",
+            aliases=("save-model-path",),
+        )
+
+    assert value == "old-path"
+
+
+def test_get_param_required_missing_fails():
+    with pytest.raises(ValueError, match="Missing required parameter 'sequence_length'"):
+        get_param({}, "sequence_length", required=True)
+
+
+def test_normalize_common_params_uses_defaults():
+    params = normalize_common_params({}, DEFAULTS)
+
+    assert params.sequence_length == 99
+    assert params.n_epochs == 10
+    assert params.batch_size == 512
+    assert params.mains_mean == 1000
+    assert params.mains_std == 600
+    assert params.appliance_params == {}
+    assert params.save_model_path is None
+    assert params.pretrained_model_path is None
+    assert params.chunk_wise_training is False
+    assert params.seed is None
+    assert params.verbose is False
+    assert params.device is None
+
+
+def test_normalize_common_params_accepts_canonical_names():
+    params = normalize_common_params(
+        {
+            "sequence_length": 101,
+            "n_epochs": 0,
+            "batch_size": 64,
+            "mains_mean": 500,
+            "mains_std": 250,
+            "appliance_params": {"fridge": {"mean": 75, "std": 25}},
+            "save_model_path": "save",
+            "pretrained_model_path": "load",
+            "chunk_wise_training": True,
+            "seed": 123,
+            "verbose": True,
+            "device": "cpu",
+        },
+        DEFAULTS,
+    )
+
+    assert params.sequence_length == 101
+    assert params.n_epochs == 0
+    assert params.batch_size == 64
+    assert params.mains_mean == 500
+    assert params.mains_std == 250
+    assert params.appliance_params == {"fridge": {"mean": 75, "std": 25}}
+    assert params.save_model_path == "save"
+    assert params.pretrained_model_path == "load"
+    assert params.chunk_wise_training is True
+    assert params.seed == 123
+    assert params.verbose is True
+    assert params.device == "cpu"
+
+
+def test_normalize_common_params_accepts_legacy_path_aliases():
+    """Hyphenated legacy path keys are accepted and record two DeprecationWarnings."""
+    legacy_params = {
+        "save-model-path": "save",
+        "pretrained-model-path": "load",
+    }
+
+    with pytest.warns(DeprecationWarning) as recorded:
+        normalized = normalize_common_params(legacy_params, DEFAULTS)
+
+    assert normalized.save_model_path == "save"
+    assert normalized.pretrained_model_path == "load"
+    assert len(recorded) == 2
+
+
+@pytest.mark.parametrize("alias", ["load_model_path", "load-model-path"])
+def test_normalize_common_params_accepts_load_model_aliases(alias):
+    with pytest.warns(DeprecationWarning, match=alias):
+        params = normalize_common_params({alias: "load"}, DEFAULTS)
+
+    assert params.pretrained_model_path == "load"
+
+
+@pytest.mark.parametrize(
+    ("field", "value", "message"),
+    [
+        ("sequence_length", 0, "sequence_length must be a positive integer"),
+        ("sequence_length", 99.5, "sequence_length must be a positive integer"),
+        ("n_epochs", -1, "n_epochs must be a non-negative integer"),
+        ("batch_size", 0, "batch_size must be a positive integer"),
+        ("mains_std", 0, "mains_std must not be zero"),
+    ],
+)
+def test_normalize_common_params_validates_common_values(field, value, message):
+    with pytest.raises(ValueError, match=message):
+        normalize_common_params({field: value}, DEFAULTS)
+
+
+def test_normalize_common_params_validates_appliance_std():
+    with pytest.raises(ValueError, match=r"appliance_params\['fridge'\]\['std'\]"):
+        normalize_common_params(
+            {"appliance_params": {"fridge": {"mean": 75, "std": 0}}},
+            DEFAULTS,
+        )
+
+
+def test_require_odd_sequence_length_accepts_odd_values():
+    require_odd_sequence_length(99)
+
+
+def test_require_odd_sequence_length_rejects_even_values():
+    with pytest.raises(ValueError, match="sequence_length must be odd"):
+        require_odd_sequence_length(100)
+
+
+def test_model_specific_parameter_validators():
+    assert validate_positive_int("time_period", 720) == 720
+    assert validate_non_negative_int("iterations", 0) == 0
+    assert validate_positive_number("learning_rate", 1e-9) == 1e-9
+
+    with pytest.raises(ValueError, match="time_period"):
+        validate_positive_int("time_period", 0)
+    with pytest.raises(ValueError, match="iterations"):
+        validate_non_negative_int("iterations", -1)
+    with pytest.raises(ValueError, match="learning_rate"):
+        validate_positive_number("learning_rate", 0)
diff --git a/tests/test_preprocessing_alignment.py b/tests/test_preprocessing_alignment.py
new file mode 100644
index 0000000..b04be15
--- /dev/null
+++ b/tests/test_preprocessing_alignment.py
@@ -0,0 +1,41 @@
+import pandas as pd
+import pytest
+
+from nilmtk_contrib.preprocessing.alignment import restore_index
+
+
+def test_restore_index_from_array_returns_series():
+    index = pd.date_range("2026-01-01", periods=3, freq="min")
+
+    restored = restore_index([1, 2, 3], index)
+
+    assert isinstance(restored, pd.Series)
+    assert restored.index.equals(index)
+    assert restored.tolist() == [1, 2, 3]
+
+
+def test_restore_index_preserves_series_name():
+    index = pd.date_range("2026-01-01", periods=2, freq="min")
+    predictions = pd.Series([5, 6], name="fridge")
+
+    restored = restore_index(predictions, index)
+
+    assert restored.name == "fridge"
+    assert restored.index.equals(index)
+
+
+def test_restore_index_preserves_dataframe_columns():
+    index = pd.date_range("2026-01-01", periods=2, freq="min")
+    predictions = pd.DataFrame({"fridge": [5, 6], "kettle": [0, 1]})
+
+    restored = restore_index(predictions, index)
+
+    assert restored.columns.tolist() == ["fridge", "kettle"]
+    assert restored.index.equals(index)
+
+
+def test_restore_index_rejects_length_mismatch():
+    index = pd.date_range("2026-01-01", periods=2, freq="min")
+
+    with pytest.raises(ValueError, match="same length"):
+        restore_index([1, 2, 3], index)
diff --git a/tests/test_preprocessing_classification.py b/tests/test_preprocessing_classification.py
new file mode 100644
index 0000000..60eba13
--- /dev/null
+++ b/tests/test_preprocessing_classification.py
@@ -0,0 +1,52 @@
+import pytest
+
+from nilmtk_contrib.preprocessing.classification import (
+    appliance_threshold,
+    classification_metadata,
+    loss_weight_metadata,
+    make_on_off_labels,
+)
+
+
+def test_appliance_threshold_prefers_appliance_specific_value():
+    params = {"fridge": {"on_power_threshold": 25}}
+
+    assert appliance_threshold(params, "fridge", default_threshold=15) == 25
+
+
+def test_appliance_threshold_requires_explicit_threshold():
+    with pytest.raises(ValueError, match="Missing on/off threshold"):
+        appliance_threshold({}, "fridge")
+
+
+def test_classification_metadata_is_serializable():
+    metadata = classification_metadata(
+        {
+            "fridge": {"on_power_threshold": 25},
+            "kettle": {"threshold": 1000},
+        },
+        default_threshold=15,
+    )
+
+    assert metadata == {
+        "default_threshold": 15,
+        "appliances": {
+            "fridge": {"on_power_threshold": 25},
+            "kettle": {"on_power_threshold": 1000},
+        },
+    }
+
+
+def test_loss_weight_metadata_rejects_non_positive_weights():
+    assert loss_weight_metadata(2.0, 0.5) == {
+        "regression": 2.0,
+        "classification": 0.5,
+    }
+    with pytest.raises(ValueError, match="regression_weight"):
+        loss_weight_metadata(0, 1)
+    with pytest.raises(ValueError, match="classification_weight"):
+        loss_weight_metadata(1, 0)
+
+
+def test_make_on_off_labels_uses_explicit_threshold():
+    assert make_on_off_labels([1, 15, 16], threshold=15).tolist() == [0, 1, 1]
diff --git a/tests/test_preprocessing_windows.py b/tests/test_preprocessing_windows.py
new file mode 100644
index 0000000..f51f0fc
--- /dev/null
+++ b/tests/test_preprocessing_windows.py
@@ -0,0 +1,100 @@
+import numpy as np
+import pytest
+
+from nilmtk_contrib.preprocessing.classification import make_on_off_labels
+from nilmtk_contrib.preprocessing.normalization import denormalize, normalize
+from nilmtk_contrib.preprocessing.windows import (
+    make_sliding_windows,
+    overlap_average,
+    sequence_to_point_targets,
+)
+
+
+def test_center_padded_windows_match_original_length():
+    windows, metadata = make_sliding_windows([1, 2, 3], 3, pad="center")
+
+    assert windows.tolist() == [[0, 1, 2], [1, 2, 3], [2, 3, 0]]
+    assert len(windows) == 3
+    assert metadata.original_length == 3
+    assert metadata.pad_left == 1
+    assert metadata.pad_right == 1
+
+
+def test_center_padded_windows_handle_short_input():
+    windows, metadata = make_sliding_windows([5], 5, pad="center")
+
+    assert windows.tolist() == [[0, 0, 5, 0, 0]]
+    assert metadata.original_length == 1
+    assert metadata.trim_slice == (2, 3)
+
+
+def test_right_padded_windows_match_original_length():
+    windows, metadata = make_sliding_windows([1, 2, 3], 3, pad="right")
+
+    assert windows.tolist() == [[1, 2, 3], [2, 3, 0], [3, 0, 0]]
+    assert len(windows) == 3
+    assert metadata.pad_left == 0
+    assert metadata.pad_right == 2
+
+
+def test_unpadded_windows_use_only_complete_windows():
+    windows, metadata = make_sliding_windows([1, 2, 3, 4], 3, pad="none")
+
+    assert windows.tolist() == [[1, 2, 3], [2, 3, 4]]
+    assert metadata.original_length == 4
+    assert metadata.pad_left == 0
+    assert metadata.pad_right == 0
+
+
+def test_unpadded_windows_short_input_returns_empty_rows():
+    windows, _ = make_sliding_windows([1, 2], 3, pad="none")
+
+    assert windows.shape == (0, 3)
+
+
+def test_make_sliding_windows_validates_arguments():
+    with pytest.raises(ValueError, match="window_length must be a positive integer"):
+        make_sliding_windows([1, 2, 3], 0)
+
+    with pytest.raises(ValueError, match="pad must be one of"):
+        make_sliding_windows([1, 2, 3], 3, pad="left")
+
+
+def test_sequence_to_point_targets_use_center_values():
+    targets = sequence_to_point_targets([10, 20, 30], 3, center=True)
+
+    assert targets.tolist() == [10, 20, 30]
+
+
+def test_sequence_to_point_targets_non_center_uses_right_edge():
+    targets = sequence_to_point_targets([10, 20, 30, 40], 3, center=False)
+
+    assert targets.tolist() == [30, 40]
+
+
+def test_overlap_average_combines_known_windows():
+    averaged = overlap_average(np.array([[1, 2, 3], [4, 5, 6]]), original_length=4)
+
+    assert averaged.tolist() == [1, 3, 4, 6]
+
+
+def test_overlap_average_trims_center_excess():
+    averaged = overlap_average(np.array([[1, 2, 3], [4, 5, 6]]), original_length=2)
+
+    assert averaged.tolist() == [3, 4]
+
+
+def test_normalize_records_fallback_std_and_denormalizes():
+    normalized, metadata = normalize([100, 200], mean=100, std=0)
+
+    assert normalized.tolist() == [0, 1]
+    assert metadata.requested_std == 0
+    assert metadata.std_used == 100
+    assert denormalize(normalized, mean=100, std=metadata.std_used).tolist() == [100, 200]
+
+
+def test_make_on_off_labels_requires_explicit_threshold():
+    assert make_on_off_labels([0, 10, 20], threshold=10).tolist() == [0, 1, 1]
+
+    with pytest.raises(ValueError, match="threshold must be explicit"):
+        make_on_off_labels([0, 10], threshold=None)
diff --git a/tests/test_random_logging.py b/tests/test_random_logging.py
new file mode 100644
index 0000000..40708e2
--- /dev/null
+++ b/tests/test_random_logging.py
@@ -0,0 +1,59 @@
+import logging
+import random
+
+import numpy as np
+
+from nilmtk_contrib.utils.logging import configure_logging, get_logger
+from nilmtk_contrib.utils.random import set_random_seed
+
+
+def test_set_random_seed_controls_python_and_numpy():
+    """Seeding twice with the same value must replay the same random draws."""
+    # Seeding mutates process-global RNG state; snapshot it so the fixed seed
+    # does not leak into tests that run after this one.
+    python_state = random.getstate()
+    numpy_state = np.random.get_state()
+    try:
+        set_random_seed(123, backends=("python", "numpy"))
+        first_python = random.random()
+        first_numpy = np.random.rand()
+
+        set_random_seed(123, backends=("python", "numpy"))
+        second_python = random.random()
+        second_numpy = np.random.rand()
+    finally:
+        random.setstate(python_state)
+        np.random.set_state(numpy_state)
+
+    assert first_python == second_python
+    assert first_numpy == second_numpy
+
+
+def test_set_random_seed_ignores_none_seed():
+    """A seed of None must be accepted without raising."""
+    set_random_seed(None, backends=("python", "numpy"))
+
+
+def test_get_logger_returns_named_logger():
+    """get_logger must return a logger carrying the requested name."""
+    logger = get_logger("nilmtk_contrib.test")
+
+    assert logger.name == "nilmtk_contrib.test"
+
+
+def test_configure_logging_sets_expected_root_level():
+    """Root level must be at or below INFO (verbose) and WARNING (quiet)."""
+    root = logging.getLogger()
+    # The root logger is process-wide state. Restore its level and handler list
+    # (configure_logging may install handlers) so later tests are unaffected.
+    saved_level = root.level
+    saved_handlers = list(root.handlers)
+    try:
+        configure_logging(verbose=True)
+        assert root.level <= logging.INFO
+
+        configure_logging(verbose=False)
+        assert root.level <= logging.WARNING
+    finally:
+        root.handlers[:] = saved_handlers
+        root.setLevel(saved_level)
diff --git a/tests/test_validation.py b/tests/test_validation.py
new file mode 100644
index 0000000..4ca71d0
--- /dev/null
+++ b/tests/test_validation.py
@@ -0,0 +1,181 @@
+import numpy as np
+import pandas as pd
+
+from nilmtk_contrib.utils.validation import (
+    safe_train_test_split,
+    should_train,
+    train_validation_split,
+)
+
+
+def test_should_train_reports_skip_reason():
+    decision = should_train(num_samples=1, min_samples=2)
+
+    assert decision.should_train is False
+    assert decision.num_samples == 1
+    assert decision.min_samples == 2
+    assert "below" in decision.reason
+
+
+def test_should_train_reports_trainable_input():
+    decision = should_train(num_samples=2, min_samples=2)
+
+    assert decision.should_train is True
+    assert decision.reason == "enough samples to train."
+
+
+def test_tail_split_guarantees_validation_sample():
+    split = train_validation_split(
+        np.arange(10),
+        np.arange(10) + 100,
+        validation_fraction=0.01,
+    )
+
+    assert split.metadata.should_train is True
+    assert split.metadata.validation_enabled is True
+    assert split.metadata.train_size == 9
+    assert split.metadata.validation_size == 1
+    assert split.X_train.tolist() == list(range(9))
+    assert split.X_val.tolist() == [9]
+    assert split.y_val.tolist() == [109]
+
+
+def test_tiny_dataset_skips_when_validation_is_required():
+    split = train_validation_split(
+        np.asarray([1]),
+        np.asarray([10]),
+        min_train=1,
+        min_val=1,
+        allow_no_validation=False,
+    )
+
+    assert split.metadata.should_train is False
+    assert split.metadata.validation_enabled is False
+    assert split.X_train is None
+    assert "min_train + min_val" in split.metadata.reason
+
+
+def test_tiny_dataset_can_train_without_validation_when_allowed():
+    split = train_validation_split(
+        np.asarray([1]),
+        np.asarray([10]),
+        min_train=1,
+        min_val=1,
+        allow_no_validation=True,
+    )
+
+    assert split.metadata.should_train is True
+    assert split.metadata.validation_enabled is False
+    assert split.metadata.train_size == 1
+    assert split.metadata.validation_size == 0
+    assert split.X_train.tolist() == [1]
+    assert split.X_val.size == 0
+
+
+def test_empty_dataset_skips_even_when_no_validation_allowed():
+    split = train_validation_split(
+        np.asarray([]),
+        np.asarray([]),
+        min_train=1,
+        min_val=1,
+        allow_no_validation=True,
+    )
+
+    assert split.metadata.should_train is False
+    assert split.metadata.train_size == 0
+    assert split.X_train is None
+
+
+def test_random_split_is_deterministic_with_seed():
+    first = train_validation_split(
+        np.arange(20),
+        np.arange(20),
+        validation_fraction=0.25,
+        strategy="random",
+        seed=123,
+    )
+    second = train_validation_split(
+        np.arange(20),
+        np.arange(20),
+        validation_fraction=0.25,
+        strategy="random",
+        seed=123,
+    )
+
+    assert first.X_train.tolist() == second.X_train.tolist()
+    assert first.X_val.tolist() == second.X_val.tolist()
+    assert first.metadata.validation_size == 5
+
+
+def test_split_preserves_pandas_objects_and_indices():
+    X = pd.DataFrame({"mains": [1, 2, 3, 4]}, index=list("abcd"))
+    y = pd.Series([10, 20, 30, 40], index=list("abcd"), name="fridge")
+
+    split = train_validation_split(X, y, validation_fraction=0.25)
+
+    assert isinstance(split.X_train, pd.DataFrame)
+    assert isinstance(split.y_val, pd.Series)
+    assert split.X_train.index.tolist() == ["a", "b", "c"]
+    assert split.y_val.index.tolist() == ["d"]
+    assert split.y_val.name == "fridge"
+
+
+def test_split_supports_plain_lists():
+    split = train_validation_split(
+        ["a", "b", "c", "d"],
+        [1, 2, 3, 4],
+        validation_fraction=0.5,
+    )
+
+    assert split.X_train == ["a", "b"]
+    assert split.y_train == [1, 2]
+    assert split.X_val == ["c", "d"]
+    assert split.y_val == [3, 4]
+
+
+def test_split_rejects_invalid_arguments():
+    invalid_cases = [
+        {"strategy": "middle"},
+        {"validation_fraction": 0},
+        {"validation_fraction": 1},
+        {"min_train": 0},
+        {"min_val": 0},
+    ]
+
+    for kwargs in invalid_cases:
+        try:
+            train_validation_split(np.arange(3), np.arange(3), **kwargs)
+        except ValueError:
+            pass
+        else:
+            raise AssertionError(f"Expected ValueError for {kwargs}")
+
+
+def test_split_rejects_length_mismatch():
+    try:
+        train_validation_split(np.arange(3), np.arange(2))
+    except ValueError as exc:
+        assert "same number of samples" in str(exc)
+    else:
+        raise AssertionError("Expected ValueError")
+
+
+def test_safe_train_test_split_guarantees_validation_when_possible():
+    train_x, val_x, train_y, val_y = safe_train_test_split(
+        np.arange(3),
+        np.arange(3) + 10,
+        test_size=0.15,
+        random_state=1,
+    )
+
+    assert len(train_x) == 2
+    assert len(val_x) == 1
+    assert len(train_y) == 2
+    assert len(val_y) == 1
+
+
+def test_safe_train_test_split_handles_single_sample():
+    train_x, val_x = safe_train_test_split(np.asarray([1]), test_size=0.15)
+
+    assert train_x.tolist() == [1]
+    assert val_x.size == 0
diff --git a/uv.lock b/uv.lock
index 4cf56a6..6151e14 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,6 +1,6 @@
 version = 1
-revision = 2
-requires-python = "==3.11.5"
+revision = 3
+requires-python = "==3.11.*"
 
 [[package]]
 name = "absl-py"
@@ -75,6 +75,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5b/70/12c9490bae7c2f4692627e17d916fc002b6812453adcbb834cd2c24c298f/blosc2-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:ad6fa89117102a25344f311c45f59d9c8a36a647cc54402da47385cce6f56f7a", size = 2199425, upload-time = "2025-06-24T15:28:42.351Z" },
 ]
 
+[[package]]
+name = "build"
+version = "1.5.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "os_name == 'nt'" },
+    { name = "packaging" },
+    { name = "pyproject-hooks" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/78/e0/df5e171f685f82f37b12e1f208064e24244911079d7b767447d1af7e0d70/build-1.5.0.tar.gz", hash = "sha256:302c22c3ba2a0fd5f3911918651341ebb3896176cbdec15bd421f80b1afc7647", size = 89796, upload-time = "2026-04-30T03:18:25.17Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0d/fe/6bea5c9162869c5beba5d9c8abbed835ec85bf1ec1fba05a3822325c45f3/build-1.5.0-py3-none-any.whl", hash = "sha256:13f3eecb844759ab66efec90ca17639bbf14dc06cb2fdf37a9010322d9c50a6f", size = 26018, upload-time = "2026-04-30T03:18:23.644Z" },
+]
+
 [[package]]
 name = "cachetools"
 version = "5.5.2"
@@ -222,6 +236,11 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/08/b8/7ddd1e8ba9701dea08ce22029917140e6f66a859427406579fd8d0ca7274/coverage-7.9.1-py3-none-any.whl", hash = "sha256:66b974b145aa189516b6bf2d8423e888b742517d37872f6ee4c5be0073bd9a3c", size = 204000, upload-time = "2025-06-13T13:02:27.173Z" },
 ]
 
+[package.optional-dependencies]
+toml = [
+    { name = "tomli", marker = "python_full_version <= '3.11'" },
+]
+
 [[package]]
 name = "cvxpy"
 version = "1.6.6"
@@ -694,7 +713,7 @@ wheels = [
 
 [[package]]
 name = "nilm-metadata"
-version = "0.2.6.dev4+g9082f10"
+version = "0.2.6.dev4+g9082f10c2"
 source = { git = "https://github.com/nilmtk/nilm_metadata.git#9082f10c20f0120b1ff80db2fc8556dc74ca6a80" }
 dependencies = [
     { name = "pandas" },
@@ -726,49 +745,129 @@ dependencies = [
 name = "nilmtk-contrib"
 version = "0.1.2"
 source = { editable = "." }
-dependencies = [
+
+[package.optional-dependencies]
+all = [
     { name = "cvxpy" },
+    { name = "hmmlearn" },
+    { name = "matplotlib" },
     { name = "nilmtk" },
+    { name = "numpy" },
+    { name = "pandas" },
+    { name = "scikit-learn" },
+    { name = "scipy" },
     { name = "tensorflow" },
     { name = "tensorflow-io-gcs-filesystem" },
     { name = "torch" },
     { name = "tqdm" },
 ]
-
-[package.optional-dependencies]
+classical = [
+    { name = "cvxpy" },
+    { name = "hmmlearn" },
+    { name = "matplotlib" },
+    { name = "nilmtk" },
+    { name = "numpy" },
+    { name = "pandas" },
+    { name = "scikit-learn" },
+    { name = "scipy" },
+]
 dev = [
     { name = "black" },
+    { name = "build" },
+    { name = "numpy" },
+    { name = "pandas" },
     { name = "pytest" },
     { name = "pytest-cov" },
     { name = "ruff" },
 ]
+nilm = [
+    { name = "nilmtk" },
+]
+tensorflow = [
+    { name = "matplotlib" },
+    { name = "nilmtk" },
+    { name = "numpy" },
+    { name = "pandas" },
+    { name = "scikit-learn" },
+    { name = "tensorflow" },
+    { name = "tensorflow-io-gcs-filesystem" },
+]
+torch = [
+    { name = "matplotlib" },
+    { name = "nilmtk" },
+    { name = "numpy" },
+    { name = "pandas" },
+    { name = "scikit-learn" },
+    { name = "torch" },
+    { name = "tqdm" },
+]
 
 [package.dev-dependencies]
 dev = [
     { name = "black" },
+    { name = "build" },
+    { name = "numpy" },
+    { name = "pandas" },
     { name = "pytest" },
+    { name = "pytest-cov" },
     { name = "ruff" },
 ]
 
 [package.metadata]
 requires-dist = [
     { name = "black", marker = "extra == 'dev'", specifier = ">=23.0.0" },
-    { name = "cvxpy", specifier = ">=1.0.0" },
-    { name = "nilmtk", git = "https://github.com/nilmtk/nilmtk.git" },
+    { name = "build", marker = "extra == 'dev'", specifier = ">=1.0.0" },
+    { name = "cvxpy", marker = "extra == 'all'", specifier = ">=1.0.0" },
+    { name = "cvxpy", marker = "extra == 'classical'", specifier = ">=1.0.0" },
+    { name = "hmmlearn", marker = "extra == 'all'" },
+    { name = "hmmlearn", marker = "extra == 'classical'" },
+    { name = "matplotlib", marker = "extra == 'all'" },
+    { name = "matplotlib", marker = "extra == 'classical'" },
+    { name = "matplotlib", marker = "extra == 'tensorflow'" },
+    { name = "matplotlib", marker = "extra == 'torch'" },
+    { name = "nilmtk", marker = "extra == 'all'", git = "https://github.com/nilmtk/nilmtk.git" },
+    { name = "nilmtk", marker = "extra == 'classical'", git = "https://github.com/nilmtk/nilmtk.git" },
+    { name = "nilmtk", marker = "extra == 'nilm'", git = "https://github.com/nilmtk/nilmtk.git" },
+    { name = "nilmtk", marker = "extra == 'tensorflow'", git = "https://github.com/nilmtk/nilmtk.git" },
+    { name = "nilmtk", marker = "extra == 'torch'", git = "https://github.com/nilmtk/nilmtk.git" },
+    { name = "numpy", marker = "extra == 'all'" },
+    { name = "numpy", marker = "extra == 'classical'" },
+    { name = "numpy", marker = "extra == 'dev'" },
+    { name = "numpy", marker = "extra == 'tensorflow'" },
+    { name = "numpy", marker = "extra == 'torch'" },
+    { name = "pandas", marker = "extra == 'all'" },
+    { name = "pandas", marker = "extra == 'classical'" },
+    { name = "pandas", marker = "extra == 'dev'" },
+    { name = "pandas", marker = "extra == 'tensorflow'" },
+    { name = "pandas", marker = "extra == 'torch'" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.4.0" },
     { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.1.0" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.0.280" },
-    { name = "tensorflow", specifier = ">=2.12.0,<2.16.0" },
-    { name = "tensorflow-io-gcs-filesystem", specifier = "==0.31.0" },
-    { name = "torch", specifier = ">=2.0,<2.7" },
-    { name = "tqdm", specifier = ">=4.66" },
-]
-provides-extras = ["dev"]
+    { name = "scikit-learn", marker = "extra == 'all'" },
+    { name = "scikit-learn", marker = "extra == 'classical'" },
+    { name = "scikit-learn", marker = "extra == 'tensorflow'" },
+    { name = "scikit-learn", marker = "extra == 'torch'" },
+    { name = "scipy", marker = "extra == 'all'" },
+    { name = "scipy", marker = "extra == 'classical'" },
+    { name = "tensorflow", marker = "extra == 'all'", specifier = ">=2.12.0,<2.16.0" },
+    { name = "tensorflow", marker = "extra == 'tensorflow'", specifier = ">=2.12.0,<2.16.0" },
+    { name = "tensorflow-io-gcs-filesystem", marker = "extra == 'all'", specifier = "==0.31.0" },
+    { name = "tensorflow-io-gcs-filesystem", marker = "extra == 'tensorflow'", specifier = "==0.31.0" },
+    { name = "torch", marker = "extra == 'all'", specifier = ">=2.0,<2.7" },
+    { name = "torch", marker = "extra == 'torch'", specifier = ">=2.0,<2.7" },
+    { name = "tqdm", marker = "extra == 'all'", specifier = ">=4.66" },
+    { name = "tqdm", marker = "extra == 'torch'", specifier = ">=4.66" },
+]
+provides-extras = ["tensorflow", "torch", "classical", "nilm", "all", "dev"]
 
 [package.metadata.requires-dev]
 dev = [
     { name = "black", specifier = ">=23.0.0" },
+    { name = "build", specifier = ">=1.0.0" },
+    { name = "numpy" },
+    { name = "pandas" },
     { name = "pytest", specifier = ">=7.4.0" },
+    { name = "pytest-cov", specifier = ">=4.1.0" },
     { name = "ruff", specifier = ">=0.0.280" },
 ]
 
@@ -1174,6 +1273,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl", hash = "sha256:a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf", size = 111120, upload-time = "2025-03-25T05:01:24.908Z" },
 ]
 
+[[package]]
+name = "pyproject-hooks"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e7/82/28175b2414effca1cdac8dc99f76d660e7a4fb0ceefa4b4ab8f5f6742925/pyproject_hooks-1.2.0.tar.gz", hash = "sha256:1e859bd5c40fae9448642dd871adf459e5e2084186e8d2c2a79a824c970da1f8", size = 19228, upload-time = "2024-09-29T09:24:13.293Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bd/24/12818598c362d7f300f18e74db45963dbcb85150324092410c8b49405e42/pyproject_hooks-1.2.0-py3-none-any.whl", hash = "sha256:9e5c6bfa8dcc30091c74b0cf803c81fdd29d94f01992a7707bc97babb1141913", size = 10216, upload-time = "2024-09-29T09:24:11.978Z" },
+]
+
 [[package]]
 name = "pytest"
 version = "8.4.1"
@@ -1195,7 +1303,7 @@ name = "pytest-cov"
 version = "6.2.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "coverage" },
+    { name = "coverage", extra = ["toml"] },
     { name = "pluggy" },
     { name = "pytest" },
 ]
@@ -1534,6 +1642,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" },
 ]
 
+[[package]]
+name = "tomli"
+version = "2.4.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/22/de/48c59722572767841493b26183a0d1cc411d54fd759c5607c4590b6563a6/tomli-2.4.1.tar.gz", hash = "sha256:7c7e1a961a0b2f2472c1ac5b69affa0ae1132c39adcb67aba98568702b9cc23f", size = 17543, upload-time = "2026-03-25T20:22:03.828Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f4/11/db3d5885d8528263d8adc260bb2d28ebf1270b96e98f0e0268d32b8d9900/tomli-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f8f0fc26ec2cc2b965b7a3b87cd19c5c6b8c5e5f436b984e85f486d652285c30", size = 154704, upload-time = "2026-03-25T20:21:10.473Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/f7/675db52c7e46064a9aa928885a9b20f4124ecb9bc2e1ce74c9106648d202/tomli-2.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ab97e64ccda8756376892c53a72bd1f964e519c77236368527f758fbc36a53a", size = 149454, upload-time = "2026-03-25T20:21:12.036Z" },
+    { url = "https://files.pythonhosted.org/packages/61/71/81c50943cf953efa35bce7646caab3cf457a7d8c030b27cfb40d7235f9ee/tomli-2.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96481a5786729fd470164b47cdb3e0e58062a496f455ee41b4403be77cb5a076", size = 237561, upload-time = "2026-03-25T20:21:13.098Z" },
+    { url = "https://files.pythonhosted.org/packages/48/c1/f41d9cb618acccca7df82aaf682f9b49013c9397212cb9f53219e3abac37/tomli-2.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a881ab208c0baf688221f8cecc5401bd291d67e38a1ac884d6736cbcd8247e9", size = 243824, upload-time = "2026-03-25T20:21:14.569Z" },
+    { url = "https://files.pythonhosted.org/packages/22/e4/5a816ecdd1f8ca51fb756ef684b90f2780afc52fc67f987e3c61d800a46d/tomli-2.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47149d5bd38761ac8be13a84864bf0b7b70bc051806bc3669ab1cbc56216b23c", size = 242227, upload-time = "2026-03-25T20:21:15.712Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/49/2b2a0ef529aa6eec245d25f0c703e020a73955ad7edf73e7f54ddc608aa5/tomli-2.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ec9bfaf3ad2df51ace80688143a6a4ebc09a248f6ff781a9945e51937008fcbc", size = 247859, upload-time = "2026-03-25T20:21:17.001Z" },
+    { url = "https://files.pythonhosted.org/packages/83/bd/6c1a630eaca337e1e78c5903104f831bda934c426f9231429396ce3c3467/tomli-2.4.1-cp311-cp311-win32.whl", hash = "sha256:ff2983983d34813c1aeb0fa89091e76c3a22889ee83ab27c5eeb45100560c049", size = 97204, upload-time = "2026-03-25T20:21:18.079Z" },
+    { url = "https://files.pythonhosted.org/packages/42/59/71461df1a885647e10b6bb7802d0b8e66480c61f3f43079e0dcd315b3954/tomli-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:5ee18d9ebdb417e384b58fe414e8d6af9f4e7a0ae761519fb50f721de398dd4e", size = 108084, upload-time = "2026-03-25T20:21:18.978Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/83/dceca96142499c069475b790e7913b1044c1a4337e700751f48ed723f883/tomli-2.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:c2541745709bad0264b7d4705ad453b76ccd191e64aa6f0fc66b69a293a45ece", size = 95285, upload-time = "2026-03-25T20:21:20.309Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/61/cceae43728b7de99d9b847560c262873a1f6c98202171fd5ed62640b494b/tomli-2.4.1-py3-none-any.whl", hash = "sha256:0d85819802132122da43cb86656f8d1f8c6587d54ae7dcaf30e90533028b49fe", size = 14583, upload-time = "2026-03-25T20:22:03.012Z" },
+]
+
 [[package]]
 name = "torch"
 version = "2.6.0"