"""Task definitions shared by the data-preprocessing pipeline (dp_utils/task.py)."""
from enum import Enum


class Task(Enum):
    """The embedding-arithmetic operations the model is trained to perform."""

    # Combine two simple sentences into one complex sentence.
    ADD = "ADD"
    # Remove one simple sentence from a complex sentence.
    DIFFERENCE = "DIFFERENCE"
# %% [markdown]
"""
Problem 1:
Computing the ROUGE score requires both predicted and target texts.
Generating the predicted text involves decoding embeddings which, after
passing through an untrained linear transformation, can make the decoder
produce excessively long outputs (e.g. "It is an integral part of the
conversation." instead of the expected "hello").  As a result, decoding
16 embeddings takes roughly 10 seconds on average.
"""

# %%

# !pip install "datasets>=3,<4"

# %%

import os
import sys

# Make the repository root importable so `dp_utils` resolves when this file
# runs as a script or notebook.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
if project_root not in sys.path:
    print(f"Adding project root to sys.path: {project_root}")
    sys.path.append(project_root)

# %%

from dp_utils import Task

import random
from argparse import ArgumentParser
from dataclasses import dataclass
from typing import TypedDict

import numpy as np
import torch as t
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch import distributed as dist
from torch.utils.data import (
    DataLoader,
    DistributedSampler,
    IterableDataset,
)

from datasets import load_dataset, load_from_disk
from jaxtyping import Float

from pytorch_lightning import (
    LightningDataModule,
    LightningModule,
    Trainer,
)
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.strategies import DDPStrategy
from pytorch_lightning.utilities import rank_zero_info

from transformers.modeling_outputs import ModelOutput
from transformers.optimization import get_linear_schedule_with_warmup

from sonar.inference_pipelines.text import (
    EmbeddingToTextModelPipeline,
    TextToEmbeddingModelPipeline,
)

from fairseq2 import setup_fairseq2
from fairseq2.data import Collater
from fairseq2.data.text.tokenizers import get_text_tokenizer_hub
from fairseq2.models.sequence import SequenceBatch
from fairseq2.nn.padding import get_seqs_and_padding_mask

# from torchmetrics.text.rouge import ROUGEScore

# %%

print("Setting float32 matmul precision to 'high'...")  # For better performance
t.set_float32_matmul_precision('high')

# %%

print("Setting up fairseq2...")
setup_fairseq2()

# %%

class Args(TypedDict):
    """Typed view of the parsed command-line arguments."""

    train: bool
    checkpoint_path: str | None
    data_dir: str
    default_root_dir: str
    accelerator: str
    strategy: str
    devices: list[int]
    batch_size: int
    lr: float
    num_warmup_steps: int
    # max_steps = num_samples // (devices * batch_size) * max_epochs,
    # e.g. (989944 // (3 * 8) + 989944 // (3 * 8)) * 10 = 824940 steps.
    max_steps: int


def parse_args() -> Args:
    """Parse command-line flags into an :class:`Args` mapping."""
    parser = ArgumentParser(description="Data Preprocessing Script")
    parser.add_argument(
        "--train",
        action="store_true",
        help="Run in training mode.",
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="Path to the workspace directory where data will be processed.",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default=None,
        help="Path to the model checkpoint for loading.",
    )
    parser.add_argument(
        "--default_root_dir",
        type=str,
        help="Default root directory for PyTorch Lightning logs.",
    )
    parser.add_argument(
        "--accelerator",
        type=str,
        help="Type of accelerator to use (e.g., 'gpu', 'cpu').",
    )
    parser.add_argument(
        "--strategy",
        type=str,
        help="Distributed training strategy (e.g., 'ddp', 'dp').",
    )
    parser.add_argument("--devices", nargs="+", type=int)
    parser.add_argument(
        "--batch_size",
        type=int,
        help="Batch size per GPU for training.",
    )
    parser.add_argument(
        "--lr",
        type=float,
        help="Learning rate for the optimizer.",
    )
    parser.add_argument(
        "--num_warmup_steps",
        type=int,
        help="Number of warmup steps for the learning rate scheduler.",
    )
    parser.add_argument(
        "--max_steps",
        type=int,
        help="Total number of training steps.",
    )
    return Args(**vars(parser.parse_args()))


def is_notebook() -> bool:
    """Return True when running under IPython/Jupyter."""
    try:
        __IPYTHON__  # type: ignore  # only defined by IPython
    except NameError:
        return False
    return True


if is_notebook():
    # In a notebook there is no real command line; fabricate one.
    WORKSPACE_DIR = "/workspace/ALGOVERSE/UJR/jason"
    DEFAULT_ROOT_DIR = f"{WORKSPACE_DIR}/experiments/data_preprocess"
    DATA_DIR = f"{WORKSPACE_DIR}/data"
    sys.argv = [
        'main.py',
        # '--train',
        '--data_dir', DATA_DIR,
        '--default_root_dir', DEFAULT_ROOT_DIR,
        '--accelerator', 'gpu',
        '--strategy', 'auto',
        '--devices', '2', '3',
        '--batch_size', '8',
        '--lr', '1e-4',
        '--num_warmup_steps', '10',
        '--max_steps', '20',
    ]

    CHECKPOINT_PATH = f"{DEFAULT_ROOT_DIR}/lightning_logs/version_1/checkpoints/epoch=0-step=123250-val_loss_epoch=0.7371.ckpt"
    sys.argv += ['--checkpoint_path', CHECKPOINT_PATH]

args = parse_args()

print("Parsed arguments:")
for key, value in args.items():
    print(f"{key}: {value}")

# %%

class DataCollator:
    """Collate tokenised samples into lists of 1-D token tensors.

    Padding is intentionally NOT applied here — the model pads all three
    sequence groups together with a fairseq2 ``Collater``; ``pad_idx`` is
    retained in case collate-time padding is reinstated.
    """

    def __init__(self, pad_idx: int):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        return {
            "input_a": [t.tensor(item["input_a"]) for item in batch],
            "input_b": [t.tensor(item["input_b"]) for item in batch],
            "input_target": [t.tensor(item["input_target"]) for item in batch],
        }
"input_b": input_b_padded, + # "input_target": input_target_padded, + } + +class TaskDataModule(LightningDataModule): + def __init__( + self, + data_dir: str, + task: Task, + batch_size: int, + ): + super().__init__() + + self.data_dir = data_dir + self.task = task + self.batch_size = batch_size + + tokenizer_hub = get_text_tokenizer_hub() + self.tokenizer = tokenizer_hub.load( + name_or_card="text_sonar_basic_encoder", + ) + self.collator = DataCollator(pad_idx=self.tokenizer.vocab_info.pad_idx) + + self.train_dataset = None + self.val_dataset = None + self.test_dataset = None + + def prepare_data(self): + # download, split, etc... + # only called on 1 GPU/TPU in distributed + self._prepare_data_wiki_split() + + def setup(self, stage): + self._setup_load_from_disk( + stage=stage, + path="google-research-datasets/wiki_split", + ) + + def train_dataloader(self): + """ + Reference: + [1] https://lightning.ai/docs/pytorch/stable/common/trainer.html#use-distributed-sampler + """ + sampler = ( + DistributedSampler(self.train_dataset, shuffle=True) + if dist.is_available() and dist.is_initialized() + else None + ) + dataloader = TaskDataLoader( + task=self.task, + sampler=sampler, + shuffle=sampler is None, + dataset=self.train_dataset, + batch_size=self.batch_size, + drop_last=True, + collate_fn=self.collator, + num_workers=os.cpu_count() // 2 if os.cpu_count() > 1 else 1, + ) + return dataloader + + def val_dataloader(self): + sampler = ( + DistributedSampler(self.val_dataset, shuffle=False) + if dist.is_available() and dist.is_initialized() + else None + ) + dataloader = TaskDataLoader( + task=self.task, + sampler=sampler, + shuffle=sampler is None, + dataset=self.val_dataset, + batch_size=self.batch_size, + drop_last=False, + collate_fn=self.collator, + num_workers=os.cpu_count() // 2 if os.cpu_count() > 1 else 1, + ) + return dataloader + + def test_dataloader(self): + sampler = ( + DistributedSampler(self.test_dataset, shuffle=False) + if dist.is_available() and 
dist.is_initialized() + else None + ) + dataloader = TaskDataLoader( + task=self.task, + sampler=sampler, + shuffle=sampler is None, + dataset=self.test_dataset, + batch_size=self.batch_size, + drop_last=False, + collate_fn=self.collator, + num_workers=os.cpu_count() // 2 if os.cpu_count() > 1 else 1, + ) + return dataloader + + def _prepare_data_wiki_split(self): + def preprocess(item): + complex_sentence = item['complex_sentence'] + simple_sentence_1 = item['simple_sentence_1'] + simple_sentence_2 = item['simple_sentence_2'] + + flip = random.random() < 0.5 + + match self.task: + case Task.ADD: + input_a = simple_sentence_1 + input_b = simple_sentence_2 + input_target = complex_sentence + case Task.DIFFERENCE: + input_a = complex_sentence + if flip: + input_b = simple_sentence_2 + input_target = simple_sentence_1 + else: + input_b = simple_sentence_1 + input_target = simple_sentence_2 + case _: + raise ValueError(f"Unknown task: {task}") + + tokenizer_encoder = self.tokenizer.create_encoder( + lang="eng_Latn", + ) + input_a_tokens = tokenizer_encoder(input_a) + input_b_tokens = tokenizer_encoder(input_b) + input_target_tokens = tokenizer_encoder(input_target) + + return { + # "input_a": input_a, + # "input_b": input_b, + # "input_target": input_target, + "input_a": input_a_tokens, + "input_b": input_b_tokens, + "input_target": input_target_tokens, + } + + try: + load_from_disk( + os.path.join( + self.data_dir, + "google-research-datasets", + "wiki_split", + "processed", + f"{self.task.value}_train", + ) + ) + load_from_disk( + os.path.join( + self.data_dir, + "google-research-datasets", + "wiki_split", + "processed", + f"{self.task.value}_val", + ) + ) + load_from_disk( + os.path.join( + self.data_dir, + "google-research-datasets", + "wiki_split", + "processed", + f"{self.task.value}_test", + ) + ) + return # TODO: uncomment this later + except FileNotFoundError: + print(f"Data for task {self.task.value} not found, downloading and processing...") + + dataset = 
load_dataset( + path="google-research-datasets/wiki_split", + ) + + train_dataset = dataset['train'].map( + preprocess, + remove_columns=dataset['train'].column_names, + # batched=True, + # batch_size=5, + ) + + val_dataset = dataset['validation'].map( + preprocess, + remove_columns=dataset['validation'].column_names, + # batched=True, + # batch_size=5, + ) + + test_dataset = dataset['test'].map( + preprocess, + # batched=True, + # batch_size=5, + remove_columns=dataset['test'].column_names, + ) + + train_dataset.save_to_disk( + os.path.join( + self.data_dir, + "google-research-datasets", + "wiki_split", + "processed", + f"{self.task.value}_train", + ) + ) + val_dataset.save_to_disk( + os.path.join( + self.data_dir, + "google-research-datasets", + "wiki_split", + "processed", + f"{self.task.value}_val", + ), + ) + test_dataset.save_to_disk( + os.path.join( + self.data_dir, + "google-research-datasets", + "wiki_split", + "processed", + f"{self.task.value}_test", + ), + ) + + def _setup_load_from_disk( + self, + stage: str, + path: str, + ): + if stage == "fit": + self.train_dataset = load_from_disk( + os.path.join( + self.data_dir, + path, + "processed", + f"{self.task.value}_train", + ) + ) + self.val_dataset = load_from_disk( + os.path.join( + self.data_dir, + path, + "processed", + f"{self.task.value}_val", + ) + ) + rank_zero_info(f"Loaded {path} {self.task.value} train dataset with {len(self.train_dataset)} samples.") + rank_zero_info(f"Loaded {path} {self.task.value} val dataset with {len(self.val_dataset)} samples.") + elif stage == "test": + self.test_dataset = load_from_disk( + os.path.join( + self.data_dir, + path, + "processed", + f"{self.task.value}_test", + ) + ) + rank_zero_info(f"Loaded {path} {self.task.value} test dataset with {len(self.test_dataset)} samples.") + else: + raise ValueError(f"Unknown stage: {stage}") + + def teardown(self, stage): + if stage == "fit": + self.train_dataset = None + self.val_dataset = None + rank_zero_info("Teardown: 
train and val datasets set to None.") + elif stage == "test": + self.test_dataset = None + rank_zero_info("Teardown: test dataset set to None.") + else: + raise ValueError(f"Unknown stage: {stage}") + +class TaskDataLoader(DataLoader): + def __init__(self, task: Task, *args, **kwargs): + super().__init__(*args, **kwargs) + self.task = task + + def __iter__(self): + for batch in super().__iter__(): + batch["task"] = self.task + yield batch + +class MultiTaskDataModule(LightningDataModule): + def __init__( + self, + data_dir: str, + tasks: list[Task], + batch_size: int, + ): + super().__init__() + self.datasets: dict[Task, TaskDataModule] = {} + for task in tasks: + self.datasets[task] = TaskDataModule( + data_dir=data_dir, + task=task, + batch_size=batch_size, + ) + + def prepare_data(self): + for dataset in self.datasets.values(): + dataset.prepare_data() + + def setup(self, stage): + for dataset in self.datasets.values(): + dataset.setup(stage=stage) + + def teardown(self, stage): + for dataset in self.datasets.values(): + dataset.teardown(stage=stage) + + def train_dataloader(self): + dataset = MultiTaskIterableDataset( + dataloader_dict={ + task: dataset.train_dataloader() + for task, dataset + in self.datasets.items() + } + ) + dataloader = DataLoader( + dataset=dataset, + collate_fn=lambda x: x[0], # Prevent extra tuple wrapping + ) + return dataloader + + def val_dataloader(self): + dataset = MultiTaskIterableDataset( + dataloader_dict={ + task: dataset.val_dataloader() + for task, dataset + in self.datasets.items() + } + ) + dataloader = DataLoader( + dataset=dataset, + collate_fn=lambda x: x[0], # Prevent extra tuple wrapping + ) + return dataloader + + def test_dataloader(self): + dataset = MultiTaskIterableDataset( + dataloader_dict={ + task: dataset.test_dataloader() + for task, dataset + in self.datasets.items() + } + ) + dataloader = DataLoader( + dataset=dataset, + collate_fn=lambda x: x[0], # Prevent extra tuple wrapping + ) + return dataloader + 
+class MultiTaskIterableDataset(IterableDataset): + def __init__( + self, + dataloader_dict: dict[str, DataLoader], + ): + super().__init__() + self.dataloader_dict = dataloader_dict + self.task_list = list(dataloader_dict.keys()) + + # self.dataloader_dict = dataloader_dict + # self.num_batches_dict = { + # task: len(dataloader) + # for task, dataloader in dataloader_dict.items() + # } + # self.task_list = list(dataloader_dict.keys()) + # self.world_size = 1 + # self.rank = 0 + # if dist.is_initialized(): + # self.world_size = dist.get_world_size() + # self.rank = dist.get_rank() + + # def __len__(self): + # total_batches = sum(self.num_batches_dict.values()) + # return total_batches + + def __len__(self): + total_batches = sum(len(dl) for dl in self.dataloader_dict.values()) + return total_batches + + def __iter__(self): + dataloader_dicts = { + task: iter(dataloader) + for task, dataloader + in self.dataloader_dict.items() + } + while True: + for task in self.task_list: + try: + yield next(dataloader_dicts[task]) + except StopIteration: + self.task_list.remove(task) + if not self.task_list: + return + + # all_batches = [] + # for dataloader in self.dataloader_dict.values(): + # all_batches.extend(iter(dataloader)) + + # random.shuffle(all_batches) + + # for i in range(0, len(all_batches)): + # yield all_batches[i] + +class Add(nn.Module): + def __init__(self, n_embd: int): + super().__init__() + self.linear = nn.Linear(2*n_embd, n_embd) + + def forward( + self, + x: Float[Tensor, "batch_size 2*n_embd"], + ): + x = self.linear(x) + return x + +class Difference(nn.Module): + def __init__(self, n_embd: int): + super().__init__() + self.linear = nn.Linear(2*n_embd, n_embd) + + def forward( + self, + x: Float[Tensor, "batch_size 2*n_embd"], + ): + x = self.linear(x) + return x + +@dataclass +class EncoderModelOutput(ModelOutput): + embeddings_sim: Float[Tensor, "batch_size n_embd"] + embeddings_pos: Float[Tensor, "batch_size n_embd"] | None = None + # target_seqs: 
list[str] | None = None + +class EncoderModel(nn.Module): + def __init__( + self, + ): + super().__init__() + self.encoder = TextToEmbeddingModelPipeline( + tokenizer="text_sonar_basic_encoder", + encoder="text_sonar_basic_encoder", + ) + + self.n_embd = 1024 + + # self.norm1 = nn.RMSNorm(self.n_embd) + self.add = Add(n_embd=self.n_embd) + self.difference = Difference(n_embd=self.n_embd) + # self.norm2 = nn.RMSNorm(self.n_embd) + + for param in self.encoder.model.parameters(): + param.requires_grad = False + + def forward( + self, + task: Task, + input_a: list[Float[Tensor, "seq_len"]], + input_b: list[Float[Tensor, "seq_len"]], + input_target: list[Float[Tensor, "seq_len"]] | None = None, + ): + # Version 1: input_a, input_b, input_target are list[str] + # input = input_a + input_b + # if input_target is not None: + # input += input_target + + # device = next(self.encoder.model.parameters()).device + # tokenizer_encoder = self.encoder.tokenizer.create_encoder( + # lang="eng_Latn", + # device=device, + # ) + + # seqs = [ + # tokenizer_encoder(seq) + # for seq in input + # ] + # collater = Collater( + # pad_value=self.encoder.tokenizer.vocab_info.pad_idx, + # ) + # batch = collater(seqs) + # tokens, padding_mask = get_seqs_and_padding_mask( + # data=batch, + # device=device, + # ) + + # Version 2: input_a, input_b, input_target are list[Float[Tensor, "seq_len"]] + seqs = input_a + input_b + if input_target is not None: + seqs += input_target + + collater = Collater( + pad_value=self.encoder.tokenizer.vocab_info.pad_idx, + ) + batch = collater(seqs) + device = next(self.encoder.model.parameters()).device + tokens, padding_mask = get_seqs_and_padding_mask( + data=batch, + device=device, + ) + + sequence_batch = SequenceBatch(tokens, padding_mask) + embeddings = self.encoder.model(sequence_batch).sentence_embeddings + + # embeddings = self.norm1(embeddings) + + batch_size = len(input_a) + embeddings_a = embeddings[0:batch_size] + embeddings_b = 
embeddings[batch_size:2*batch_size] + embeddings_target = embeddings[2*batch_size:] if input_target is not None else None + + embeddings_a_b = t.cat([embeddings_a, embeddings_b], dim=1) + + match task: + case Task.ADD: + embeddings_sim = self.add(embeddings_a_b) + case Task.DIFFERENCE: + embeddings_sim = self.difference(embeddings_a_b) + case _: + raise ValueError(f"Unknown task: {task}") + + # embeddings_sim = self.norm2(embeddings_sim) + embeddings_pos = embeddings_target + + return EncoderModelOutput( + embeddings_sim=embeddings_sim, + embeddings_pos=embeddings_pos, + # target_seqs=seqs[2*batch_size:] if input_target is not None else None, + ) + +@dataclass +class EncoderDecoderModelOutput(ModelOutput): + embeddings_sim: Float[Tensor, "batch_size n_embd"] + contrastive_loss: Float[Tensor, ""] | None = None + reconstruction_loss: Float[Tensor, ""] | None = None + loss: Float[Tensor, ""] | None = None + embeddings_pos: Float[Tensor, "batch_size n_embd"] | None = None + # decoded_outputs: list[str] | None = None + # decoded_targets: list[str] | None = None + +class EncoderDecoderModel(nn.Module): + def __init__( + self, + ): + super().__init__() + self.encoder = EncoderModel() + self.decoder = EmbeddingToTextModelPipeline( + tokenizer="text_sonar_basic_decoder", + decoder="text_sonar_basic_decoder", + ) + + for param in self.decoder.model.parameters(): + param.requires_grad = False + + def forward( + self, + task: Task, + input_a: list[Float[Tensor, "seq_len"]], + input_b: list[Float[Tensor, "seq_len"]], + input_target: list[Float[Tensor, "seq_len"]] | None = None, + # return_decoded: bool = False, + ): + encoder_output = self.encoder( + task=task, + input_a=input_a, + input_b=input_b, + input_target=input_target, + ) + + embeddings_sim = encoder_output.embeddings_sim # shape: (batch_size, 2, n_embd) + embeddings_pos = encoder_output.embeddings_pos + # target_seqs = encoder_output.target_seqs + + output_kwargs = { + "embeddings_sim": embeddings_sim, + } + + # if 
return_decoded: + # batch_size = embeddings_sim.shape[0] + # inputs = ( + # t.cat( + # [embeddings_sim, embeddings_pos], + # dim=0, + # ) + # if embeddings_pos is not None + # else embeddings_sim + # ) + # decoded = self.decoder.predict( + # inputs=inputs, + # target_lang="eng_Latn", + # max_seq_len=512, + # ) + # decoded_outputs = decoded[:batch_size] + # decoded_targets = ( + # decoded[batch_size:] + # if embeddings_pos is not None + # else None + # ) + # output_kwargs.update({ + # "decoded_outputs": decoded_outputs, + # "decoded_targets": decoded_targets, + # }) + + if input_target is not None: + cl_loss = self.compute_contrastive_loss( + embeddings_sim=embeddings_sim, + embeddings_pos=embeddings_pos, + ) + gen_loss = self.compute_reconstruction_loss( + input_target=input_target, + embeddings_sim=embeddings_sim, + ) + batch_size = embeddings_sim.shape[0] + gen_loss_weight = -np.log(1/batch_size) / -np.log(1/self.decoder.tokenizer.vocab_info.size) + # cl_loss random loss is -ln(1/batch_size) and + # gen_loss random loss is -ln(1/vocab_size) + loss = cl_loss + gen_loss * gen_loss_weight + output_kwargs.update({ + "contrastive_loss": cl_loss, + "reconstruction_loss": gen_loss, + "loss": loss, + "embeddings_pos": embeddings_pos, + }) + + return EncoderDecoderModelOutput(**output_kwargs) + + def compute_contrastive_loss( + self, + embeddings_sim: Float[Tensor, "batch_size n_embd"], + embeddings_pos: Float[Tensor, "batch_size n_embd"] + ): + # Contrastive learning loss + sim_fct = nn.CosineSimilarity(dim=-1) + temperature = 0.05 + cos_sim = sim_fct( + embeddings_sim.unsqueeze(dim=1), # shape: (batch_size, 1, n_embd) + embeddings_pos.unsqueeze(dim=0), # shape: (1, batch_size, n_embd) + ) / temperature + batch_size = embeddings_sim.shape[0] + cl_labels = t.arange(batch_size, device=cos_sim.device) + loss_fct = nn.CrossEntropyLoss() + cl_loss = loss_fct(cos_sim, cl_labels) + return cl_loss + + def compute_reconstruction_loss( + self, + input_target: list[Float[Tensor, 
"seq_len"]], + embeddings_sim: Float[Tensor, "batch_size n_embd"], + ): + # Generation loss + loss_fct = nn.CrossEntropyLoss() + + device = next(self.decoder.model.parameters()).device + input_to_decoder = [ + t.cat([ + t.tensor( + [ + self.decoder.tokenizer.vocab_info.eos_idx, + ], + device=device, + ), + seq[:-1], # Remove last token (EOS) + # seq[1:-1], # Remove first and last token (language token and EOS) + ]) + for seq in input_target + ] + collater = Collater( + pad_value=self.decoder.tokenizer.vocab_info.pad_idx, + ) + input_data = collater(input_to_decoder) + input_tensor, input_padding_mask = get_seqs_and_padding_mask( + data=input_data, + device=device, + ) + decoder_output, decoder_padding_mask = self.decoder.model.decode( + seqs=input_tensor, + padding_mask=input_padding_mask, + encoder_output=embeddings_sim.unsqueeze(1), + encoder_padding_mask=None, + ) + + model_output = self.decoder.model.project( + decoder_output=decoder_output, + decoder_padding_mask=decoder_padding_mask, + ) + + logits = model_output.logits + + labels = input_target + # labels = [ + # seq + # # seq[1:] # Remove first token (language token) + # for seq in input_target + # ] + + max_len_logits = logits.size(1) + max_len_labels = max(l.size(0) for l in labels) + max_len = max(max_len_logits, max_len_labels) + + logits_padded = logits + if logits.size(1) < max_len: + pad_amt = max_len - logits.size(1) + logits_padded = F.pad( + input=logits, + pad=(0, 0, 0, pad_amt), + value=self.decoder.tokenizer.vocab_info.pad_idx, + ) + + labels_padded = [] + for label in labels: + if label.size(0) < max_len: + pad_amt = max_len - label.size(0) + label = F.pad( + input=label, + pad=(0, pad_amt), + value=self.decoder.tokenizer.vocab_info.pad_idx, + ) + labels_padded.append(label) + labels_padded = t.stack(labels_padded, dim=0) + + gen_loss = loss_fct( + logits_padded.reshape(-1, logits_padded.size(-1)), # (batch_size * seq_len, vocab_size) + labels_padded.reshape(-1) # (batch_size * seq_len) + ) + 
return gen_loss + +class LitModel(LightningModule): + def __init__( + self, + lr: float, + num_warmup_steps: int, + num_training_steps: int, + ): + super().__init__() + self.save_hyperparameters() + + self.lr = lr + self.num_warmup_steps = num_warmup_steps + self.num_training_steps = num_training_steps + + self.model = EncoderDecoderModel() + + # self.dummy_parameter = nn.Parameter(t.tensor(0.0, requires_grad=True)) + + self.strict_loading = False + # This is a hack which is required + # because checkpoint['state_dict'] + # is modified in on_save_checkpoint + # to reduce checkpoint file size. + # In the future, instead of modifying + # checkpoint['state_dict'], we can + # pass the encoder and decoder + # in the argument, and use + # self.save_hyperparameters(ignore) + + def on_fit_start(self): + self.fix_device() + + def on_validation_start(self): + self.fix_device() + + def on_test_start(self): + self.fix_device() + + def fix_device(self): + device = next(self.model.decoder.model.parameters()).device + self.model.decoder.device = device + # This is a hack which is required + # because EmbeddngToTextModelPipeline + # sets the device in the constructor, + # but we need to set it after the model + # is moved to the correct device. 
+ + def get_trainable_state_dict(self): + state = {} + for name, param in self.named_parameters(): + if param.requires_grad: + state[name] = param.data.cpu() + return state + + def on_save_checkpoint(self, checkpoint): + checkpoint['state_dict'] = self.get_trainable_state_dict() + # def print_keys(d, indent=0): + # if isinstance(d, dict): + # for key, value in d.items(): + # rank_zero_info(f"{' ' * indent}Key: {key}") + # print_keys(value, indent=indent+1) + # print_keys(checkpoint) + + def training_step(self, batch): + outputs = self.model( + task=batch['task'], + input_a=batch['input_a'], + input_b=batch['input_b'], + input_target=batch['input_target'], + ) + + cl_loss = outputs.contrastive_loss + gen_loss = outputs.reconstruction_loss + loss = outputs.loss + + batch_size = len(batch['input_a']) + lr = self.trainer.optimizers[0].param_groups[0]['lr'] + self.log( + "lr", + lr, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + ) + self.log( + "train_cl_loss", + cl_loss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + sync_dist=True, + batch_size=batch_size, + ) + self.log( + "train_gen_loss", + gen_loss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + sync_dist=True, + batch_size=batch_size, + ) + self.log( + "train_loss", + loss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + sync_dist=True, + batch_size=batch_size, + ) + return loss + + # print( + # f"[GPU {self.global_rank}] " + # f"batch_idx: {batch_idx}, " + # f"task: {batch['task']}, " + # f"input_a: {batch['input_a']}, " + # f"input_a: {batch['input_a'][0][:30]}, " + # f"input_b: {batch['input_b'][0][:30]}, " + # f"input_target: {batch['input_target'][0][:30]}, " + # ) + # print( + # f"[GPU {self.global_rank}] " + # f"batch_idx: {batch_idx}, " + # f"task: {batch['task']}, " + # f"input_a: {batch['input_a'][1][:30]}, " + # f"input_b: {batch['input_b'][1][:30]}, " + # f"input_target: {batch['input_target'][1][:30]}, " + # ) + # 
sleep(0.1) + # return t.tensor(0.0, requires_grad=True) + + def vec2text( + self, + embeddings: Float[Tensor, "batch_size vocab_size"], + ): + texts = self.model.decoder.predict( + inputs=embeddings, + target_lang="eng_Latn", + max_seq_len=512, + ) + return texts + + def validation_step(self, batch, batch_idx): + # self.validation_step_outputs = { + # "contrastive_loss": [], + # "reconstruction_loss": [], + # "loss": [], + # # "decoded_targets": [], + # # "decoded_outputs": [], + # } + + outputs = self.model( + task=batch['task'], + input_a=batch['input_a'], + input_b=batch['input_b'], + input_target=batch['input_target'], + # return_decoded=True, + ) + embeddings_sim = outputs.embeddings_sim + embeddings_pos = outputs.embeddings_pos + cl_loss = outputs.contrastive_loss + gen_loss = outputs.reconstruction_loss + loss = outputs.loss + # decoded_outputs = outputs.decoded_outputs + # decoded_targets = outputs.decoded_targets + + # self.validation_step_outputs["contrastive_loss"].append(cl_loss) + # self.validation_step_outputs["reconstruction_loss"].append(gen_loss) + # self.validation_step_outputs["loss"].append(loss) + # self.validation_step_outputs["decoded_outputs"].extend(decoded_outputs) + # self.validation_step_outputs["decoded_targets"].extend(decoded_targets) + + batch_size = len(batch['input_a']) + self.log( + "val_cl_loss_epoch", + cl_loss, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True, + batch_size=batch_size, + ) + self.log( + "val_gen_loss_epoch", + gen_loss, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True, + batch_size=batch_size, + ) + self.log( + "val_loss_epoch", + loss, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True, + batch_size=batch_size, + ) + + if batch_idx == 0 and self.global_rank == 0: + self.log_examples( + tag="val_examples", + input_a=batch['input_a'], + input_b=batch['input_b'], + task=batch['task'], + embeddings_prediction=embeddings_sim, + embeddings_target=embeddings_pos, + ) + 
+ return loss + + # def on_validation_epoch_end(self): + # rank_zero_info("Validation epoch ended, calculating metrics...") + + # outputs = self.validation_step_outputs + # contrastive_loss = t.stack(outputs["contrastive_loss"]).mean() + # reconstruction_loss = t.stack(outputs["reconstruction_loss"]).mean() + # loss = t.stack(outputs["loss"]).mean() + # # decoded_outputs = outputs["decoded_outputs"] + # # decoded_targets = outputs["decoded_targets"] + + # # rouge = ROUGEScore() + # # rouge_score = rouge( + # # preds=decoded_outputs, + # # target=decoded_targets, + # # ) + # # rouge1_fmeasure = rouge_score["rouge1_fmeasure"].to(device=self.device) + # # rouge2_fmeasure = rouge_score["rouge2_fmeasure"].to(device=self.device) + # # rougeL_fmeasure = rouge_score["rougeL_fmeasure"].to(device=self.device) + + # self.log( + # "val_cl_loss_epoch", + # contrastive_loss, + # logger=True, + # on_epoch=True, + # sync_dist=True, + # ) + # self.log( + # "val_gen_loss_epoch", + # reconstruction_loss, + # logger=True, + # on_epoch=True, + # sync_dist=True, + # ) + # self.log( + # "val_loss_epoch", + # loss, + # logger=True, + # on_epoch=True, + # sync_dist=True, + # ) + + # # self.log( + # # "val_rouge1", + # # rouge1_fmeasure, + # # logger=True, + # # sync_dist=True, + # # ) + # # self.log( + # # "val_rouge2", + # # rouge2_fmeasure, + # # logger=True, + # # sync_dist=True, + # # ) + # # self.log( + # # "val_rougeL", + # # rougeL_fmeasure, + # # logger=True, + # # sync_dist=True, + # # ) + + # self.validation_step_outputs.clear() + + def test_step(self, batch, batch_idx): + # self.test_step_outputs = { + # "contrastive_loss": [], + # "reconstruction_loss": [], + # "loss": [], + # # "decoded_targets": [], + # # "decoded_outputs": [], + # } + + outputs = self.model( + task=batch['task'], + input_a=batch['input_a'], + input_b=batch['input_b'], + input_target=batch['input_target'], + # return_decoded=True, + ) + embeddings_sim = outputs.embeddings_sim + embeddings_pos = 
outputs.embeddings_pos + cl_loss = outputs.contrastive_loss + gen_loss = outputs.reconstruction_loss + loss = outputs.loss + # decoded_outputs = outputs.decoded_outputs + # decoded_targets = outputs.decoded_targets + + # self.test_step_outputs["contrastive_loss"].append(cl_loss) + # self.test_step_outputs["reconstruction_loss"].append(gen_loss) + # self.test_step_outputs["loss"].append(loss) + # self.test_step_outputs["decoded_outputs"].extend(decoded_outputs) + # self.test_step_outputs["decoded_targets"].extend(decoded_targets) + + batch_size = len(batch['input_a']) + self.log( + "test_cl_loss_epoch", + cl_loss, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True, + batch_size=batch_size, + ) + self.log( + "test_gen_loss_epoch", + gen_loss, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True, + batch_size=batch_size, + ) + self.log( + "test_loss_epoch", + loss, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True, + batch_size=batch_size, + ) + + if batch_idx == 0 and self.global_rank == 0: + self.log_examples( + tag="test_examples", + input_a=batch['input_a'], + input_b=batch['input_b'], + task=batch['task'], + embeddings_prediction=embeddings_sim, + embeddings_target=embeddings_pos, + ) + + return loss + + def log_examples( + self, + tag: str, + input_a: list[Float[Tensor, "seq_len"]], + input_b: list[Float[Tensor, "seq_len"]], + task: Task, + embeddings_prediction: Float[Tensor, "batch_size n_embd"], + embeddings_target: Float[Tensor, "batch_size n_embd"] | None = None, + ): + """ + Log some examples to TensorBoard + """ + text_decoder = self.model.decoder.tokenizer.create_decoder() + input_a = [ + text_decoder(seq) + for seq in input_a + ] + input_b = [ + text_decoder(seq) + for seq in input_b + ] + + embeddings = t.cat( + [embeddings_prediction, embeddings_target], + dim=0, + ) + texts = self.vec2text( + embeddings=embeddings, + ) + batch_size = len(input_a) + prediction = texts[:batch_size] + target = 
texts[batch_size:] + + tasks = [task.value] * batch_size + markdown_text = self.create_markdown_table( + input_a=input_a, + input_b=input_b, + tasks=tasks, + prediction=prediction, + target=target, + ) + + tensorboard = self.logger.experiment + tensorboard.add_text( + tag=tag, + text_string=markdown_text, + global_step=self.global_step, + ) + + def create_markdown_table( + self, + input_a: list[str], + input_b: list[str], + tasks: list[str], + prediction: list[str], + target: list[str], + ): + batch_size = len(input_a) + rows = [ + "| input_a | input_b | task | prediction | target |", + "| ------- | ------- | ---- | ---------- | ------ |", + ] + for i in range(batch_size): + rows.append( + f"| {input_a[i]} |" + f" {input_b[i]} |" + f" {tasks[i]} |" + f" {prediction[i]} |" + f" {target[i]} |" + ) + markdown_text = "\n".join(rows) + return markdown_text + + + # def on_test_epoch_end(self): + # rank_zero_info("Test epoch ended, calculating metrics...") + + # outputs = self.test_step_outputs + # contrastive_loss = t.stack(outputs["contrastive_loss"]).mean() + # reconstruction_loss = t.stack(outputs["reconstruction_loss"]).mean() + # loss = t.stack(outputs["loss"]).mean() + # # decoded_outputs = outputs["decoded_outputs"] + # # decoded_targets = outputs["decoded_targets"] + + # # rouge = ROUGEScore() + # # rouge_score = rouge( + # # preds=decoded_outputs, + # # target=decoded_targets, + # # ) + # # rouge1_fmeasure = rouge_score["rouge1_fmeasure"].to(device=self.device) + # # rouge2_fmeasure = rouge_score["rouge2_fmeasure"].to(device=self.device) + # # rougeL_fmeasure = rouge_score["rougeL_fmeasure"].to(device=self.device) + + # self.log( + # "test_cl_loss_epoch", + # contrastive_loss, + # logger=True, + # sync_dist=True, + # ) + # self.log( + # "test_gen_loss_epoch", + # reconstruction_loss, + # logger=True, + # sync_dist=True, + # ) + # self.log( + # "test_loss_epoch", + # loss, + # logger=True, + # sync_dist=True, + # ) + + # # self.log( + # # "test_rouge1", + # # 
rouge1_fmeasure, + # # logger=True, + # # sync_dist=True, + # # ) + # # self.log( + # # "test_rouge2", + # # rouge2_fmeasure, + # # logger=True, + # # sync_dist=True, + # # ) + # # self.log( + # # "test_rougeL", + # # rougeL_fmeasure, + # # logger=True, + # # sync_dist=True, + # # ) + + # self.test_step_outputs.clear() + + def configure_optimizers(self): + param_dict = { + pn: p + for pn, p + in self.named_parameters() + if p.requires_grad + } + decay_params = [ + p for _, p in param_dict.items() + if p.dim() >= 2 + ] + nodecay_params = [ + p for _, p in param_dict.items() + if p.dim() < 2 + ] + params = [ + {'params': decay_params, 'weight_decay': 0.01}, + {'params': nodecay_params, 'weight_decay': 0.0}, + ] + + optimizer = t.optim.AdamW( + params=params, + lr=self.lr, + ) + scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=self.num_warmup_steps, + num_training_steps=self.num_training_steps, + ) + return { + "optimizer": optimizer, + "lr_scheduler": { + "scheduler": scheduler, + "interval": "step", + "frequency": 1, + } + } + +# datamodule = TaskDataModule( +# data_dir=args['data_dir'], +# task=Task.ADD, +# batch_size=args['batch_size'], +# ) + +datamodule = MultiTaskDataModule( + data_dir=args['data_dir'], + tasks=[Task.ADD, Task.DIFFERENCE], + batch_size=args['batch_size'], +) + +# %% + +checkpoint_callback = ModelCheckpoint( + # dirpath=f"{args['default_root_dir']}/checkpoints", + filename="{epoch}-{step}-{val_loss_epoch:.4f}", + monitor="val_loss_epoch", + mode="min", + save_top_k=1, +) + +if args["strategy"] == "ddp": + strategy = DDPStrategy(find_unused_parameters=True) +elif args["strategy"] is not None: + strategy = args["strategy"] +else: + strategy = "auto" + +trainer = Trainer( + accelerator=args['accelerator'], + strategy=strategy, + devices=args['devices'], + precision="bf16-mixed", + callbacks=[checkpoint_callback], + max_steps=args['max_steps'], + # val_check_interval=args['max_steps'], + val_check_interval=250, + 
enable_checkpointing=True, + # profiler="simple", + default_root_dir=args['default_root_dir'], + # log_every_n_steps=1, +) + +model = LitModel( + lr=args['lr'], + num_warmup_steps=args['num_warmup_steps'], + num_training_steps=args['max_steps'], +) + +if args['train']: + trainer.fit( + model=model, + datamodule=datamodule, + ckpt_path=args['checkpoint_path'], + ) + +trainer.test( + model=model, + datamodule=datamodule, + ckpt_path=args['checkpoint_path'], +) + +# %% diff --git a/experiments/data_preprocess/main.sh b/experiments/data_preprocess/main.sh new file mode 100644 index 0000000..6b7a1d3 --- /dev/null +++ b/experiments/data_preprocess/main.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +CMD=" +python main.py \ + --data_dir $DATA_DIR \ + --default_root_dir $DEFAULT_ROOT_DIR \ + --accelerator gpu \ + --strategy ddp \ + --devices 2 3 \ + --batch_size 8 \ + --lr 1e-4 \ + --num_warmup_steps 1237 \ + --max_steps 123742 \ +" + +if [ -n "$TRAIN" ]; then + CMD="$CMD --train" +fi + +if [ -n "$CHECKPOINT_PATH" ]; then + CMD="$CMD --checkpoint_path $CHECKPOINT_PATH" +fi + +eval $CMD \ No newline at end of file diff --git a/notes/gen_loss.py b/notes/gen_loss.py new file mode 100644 index 0000000..c5beb82 --- /dev/null +++ b/notes/gen_loss.py @@ -0,0 +1,481 @@ +# %% [markdown] +""" +Goals: +- How to compute contrastive learning loss for text embeddings +- How to compute generation loss for text generation models + +Notes: +- The contrastive learning loss is penalizing the model for generating embeddings + that is not similar to the target embeddings and similar to other embeddings. + + However, I am not sure whether this is the right way to penalize the model. + I am not sure whether the model should be penalized for generating embeddings + that are similar to other embeddings + + Also, a cosine similarity range values are between -1 and 1, so the loss + will never be 0, even if the positive embeddings are identical and the + negative embeddings are very different. 
The solution is to scale the + cosine similarity by a temperature value, to exaggerate the differences + between the correct and incorrect embeddings. + + random guess loss = -ln(1/batch_size) + +- The generation loss might be miscalculated. + + The way `fairseq2.generation.BeamSearchSeq2SeqGenerator` works is not straightforward. + It computes the logits per chunk of 1, and then compute the scores per chunk. + Then, do beam search, select the token, and repeats until `max_seq_len` is reached. + Lastly, it sort the hypotheses by their scores. In other words, I am not sure + whether we can decode it in one forward pass and compute the loss in one forward pass. + + So far, reconstructing "hello world" will result in loss of 4.53, even though + the model predicts the correct token. The solution is to scale the loss + by 0.01, so that it does not dominate the contrastive learning loss. + + random guess loss = -ln(1/vocab_size) +""" + +# %% + +from sonar.inference_pipelines.text import ( + TextToEmbeddingModelPipeline, + EmbeddingToTextModelPipeline, +) + +from fairseq2.nn.padding import get_seqs_and_padding_mask +from fairseq2.data import Collater +from fairseq2.models.sequence import SequenceBatch +from fairseq2.generation import BeamSearchSeq2SeqGenerator + +import torch as t +from torch import nn +import torch.nn.functional as F + +from torchmetrics.text import ROUGEScore + +# %% TODO: remove this later + +text2vec_model = TextToEmbeddingModelPipeline( + tokenizer="text_sonar_basic_encoder", + encoder="text_sonar_basic_encoder", +) + +vec2text_model = EmbeddingToTextModelPipeline( + tokenizer="text_sonar_basic_decoder", + decoder="text_sonar_basic_decoder", +) + +# %% + +print("Reproducing text2vec_model.predict() behavior...") +max_seq_len = 512 +tokenizer_encoder = text2vec_model.tokenizer.create_encoder(lang="eng_Latn") +sentences = [ + "hello world", + "hello world", + "hello world", + # "hello", + # "hello world. 
my name is jeff", +] +seqs = [ + tokenizer_encoder(sentence) + for sentence in sentences +] +seqs = [ + seq[:max_seq_len] + if max_seq_len is not None + else seq + for seq in seqs +] +collater = Collater(pad_value=text2vec_model.tokenizer.vocab_info.pad_idx) +batch = collater(seqs) +tokens, padding_mask = get_seqs_and_padding_mask(data=batch) +sequence_batch = SequenceBatch(tokens, padding_mask) +embeddings = text2vec_model.model(sequence_batch).sentence_embeddings + +print(f"embeddings:\n{embeddings}") + +# %% + +print("Reproducing vec2text_model.predict() behavior...") + +target_text_encoder = vec2text_model.tokenizer.create_encoder( + task="translation", + lang="eng_Latn", + mode="target", +) +target_prefix_seqs = target_text_encoder.prefix_indices +text_decoder = vec2text_model.tokenizer.create_decoder() + +batch_size = len(embeddings) + +generator = BeamSearchSeq2SeqGenerator( + model=vec2text_model.model, + max_seq_len=max_seq_len, +) + +generator_output = generator( + embeddings, + None, + target_prefix_seqs.expand(batch_size, -1), + None +) + +texts: list[str] = [] +for idx, hypotheses in enumerate(generator_output.hypotheses): + texts.append(text_decoder(hypotheses[0].seq)) +print("texts:", texts) + +# %% + +print("Get logits") + +# One by one +input_to_decoder = [] +logits = [] +for idx, hypotheses in enumerate(generator_output.hypotheses): + # seq = t.cat( + # [ + # t.tensor([text2vec_model.tokenizer.vocab_info.eos_idx]), + # hypotheses[0].seq[:-1], + # ], + # ) + seq = t.cat( + [ + t.tensor( + [ + text2vec_model.tokenizer.vocab_info.eos_idx, + ] + ), + seqs[idx][:-1], + # seqs[idx][1:-1], # Remove first and last token (language token and EOS) + ], + ) + input_to_decoder.append(seq) + decoder_output, decoder_padding_mask = vec2text_model.model.decode( + seqs=seq.unsqueeze(0), + padding_mask=None, + encoder_output=embeddings[idx].unsqueeze(0).unsqueeze(0), + encoder_padding_mask=None, + ) + model_output = vec2text_model.model.project( + decoder_output, 
decoder_padding_mask + ) + logits.append(model_output.logits.squeeze(0)) + +labels = [ + seq + # seq[1:], # Remove first token (language token) + for seq in seqs + # hypotheses[0].seq + # for hypotheses in generator_output.hypotheses +] + +max_len_logits = max(l.size(0) for l in logits) +max_len_labels = max(l.size(0) for l in labels) +max_len = max(max_len_logits, max_len_labels) + +logits_padded = [] +for logit in logits: + if logit.size(0) < max_len: + pad_amt = max_len - logit.size(0) + logit = F.pad(logit, (0, 0, 0, pad_amt), value=vec2text_model.tokenizer.vocab_info.pad_idx) + logits_padded.append(logit) +logits_padded = t.stack(logits_padded, dim=0) + +labels_padded = [] +for label in labels: + if label.size(0) < max_len: + pad_amt = max_len - label.size(0) + label = F.pad(label, (0, pad_amt), value=vec2text_model.tokenizer.vocab_info.pad_idx) + labels_padded.append(label) +labels_padded = t.stack(labels_padded, dim=0) + +loss_fct = nn.CrossEntropyLoss( + ignore_index=vec2text_model.tokenizer.vocab_info.pad_idx, +) +gen_loss = loss_fct( + logits_padded.reshape(-1, logits_padded.size(-1)), # (batch_size * seq_len, vocab_size) + labels_padded.reshape(-1) # (batch_size * seq_len) +) +print("input_to_decoder:") +for seq in input_to_decoder: + print(seq) +print(f"labels_padded:\n{labels_padded}") +print("vocab_size:", logits_padded.size(-1)) +print( + "random guess loss:", + loss_fct( + t.zeros_like(logits_padded).reshape(-1, logits_padded.size(-1)), + labels_padded.reshape(-1) + ).item() +) +print("gen_loss:", gen_loss.item()) + +greedy_token = logits_padded.argmax(dim=-1) +print("greedy_token:", greedy_token) +for seq in greedy_token: + print(text_decoder(seq)) + +# %% + +# Batch +input_to_decoder = [ + t.cat([ + t.tensor( + [ + text2vec_model.tokenizer.vocab_info.eos_idx, + ] + ), + seq[:-1], # Remove last token (EOS) + # seq[1:-1], # Remove first and last token (language token and EOS) + ]) + for seq in seqs +] +input_data = collater(input_to_decoder) 
# Batched generation-loss computation: decode every teacher-forcing input in a
# single forward pass, project to vocabulary logits, and score them against the
# original token sequences with pad-masked cross-entropy.
input_tensor, input_padding_mask = get_seqs_and_padding_mask(data=input_data)

decoder_output, decoder_padding_mask = vec2text_model.model.decode(
    seqs=input_tensor,
    padding_mask=input_padding_mask,
    # Each sentence embedding is presented as a length-1 "encoder sequence".
    encoder_output=embeddings.unsqueeze(1),
    encoder_padding_mask=None,
)

model_output = vec2text_model.model.project(
    decoder_output=decoder_output,
    decoder_padding_mask=decoder_padding_mask,
)

logits = model_output.logits

# Targets are simply the original (un-shifted) token sequences.
labels = seqs

pad_token = vec2text_model.tokenizer.vocab_info.pad_idx

# Pad logits and labels out to one common length so they can be flattened
# together; padded label positions are ignored by the loss below.
max_len = max(logits.size(1), max(label.size(0) for label in labels))

if logits.size(1) < max_len:
    logits_padded = F.pad(
        logits,
        (0, 0, 0, max_len - logits.size(1)),
        value=pad_token,
    )
else:
    logits_padded = logits

labels_padded = t.stack(
    [
        F.pad(label, (0, max_len - label.size(0)), value=pad_token)
        if label.size(0) < max_len
        else label
        for label in labels
    ],
    dim=0,
)

loss_fct = nn.CrossEntropyLoss(
    ignore_index=pad_token,
)
gen_loss = loss_fct(
    logits_padded.reshape(-1, logits_padded.size(-1)),  # (batch_size * seq_len, vocab_size)
    labels_padded.reshape(-1),  # (batch_size * seq_len)
)

print("input_to_decoder:")
for seq in input_to_decoder:
    print(seq)
print(f"labels_padded:\n{labels_padded}")
print("vocab_size:", logits_padded.size(-1))
print(
    "random guess loss:",
    loss_fct(
        t.zeros_like(logits_padded).reshape(-1, logits_padded.size(-1)),
        labels_padded.reshape(-1)
    ).item()
)
print("gen_loss:", gen_loss.item())

# Greedy sanity check: pick the argmax token at every position and decode it.
greedy_token = logits_padded.argmax(dim=-1)
print("greedy_token:", greedy_token)
for seq in greedy_token:
    print(text_decoder(seq))

# %%
+print("Decoding with greedy search...") +batch_size = embeddings.shape[0] +input_seqs = t.tensor( + [ + [ + text2vec_model.tokenizer.vocab_info.eos_idx, + 256047, # language token + ] + for _ in range(batch_size) + ], +) + +with t.no_grad(): + for i in range(max_seq_len): + decoder_output, decoder_padding_mask = vec2text_model.model.decode( + seqs=input_seqs, + padding_mask=None, + encoder_output=embeddings.unsqueeze(1), + encoder_padding_mask=None, + ) + greedy_token = ( + vec2text_model.model.project( + decoder_output=decoder_output, + decoder_padding_mask=decoder_padding_mask, + ) + .logits[:, -1, :] + .argmax(dim=-1, keepdim=True) + ) + input_seqs = t.cat( + [input_seqs, greedy_token], + dim=1 + ) + if ( + greedy_token[-1] == text2vec_model.tokenizer.vocab_info.eos_idx + ): + break + +print(f"seqs:\n{input_seqs}") + +for seq in input_seqs: + print(text_decoder(seq)) + +# %% + +input_a = [ + "hello world", + "lorem ipsum", +] +input_b = [ + "world", + "ipsum", +] +input_target = [ + "hello", + "ipsum", +] +# input_target = None + +batch_size = len(input_a) + +input = input_a + input_b + input_target + +embeddings = text2vec_model.predict( + input=input, + source_lang="eng_Latn", +) +print("embeddings shape:", embeddings.shape) + +embeddings = embeddings.clone() +# norm1 = nn.RMSNorm(1024) +# embeddings = norm1(embeddings) + +embeddings_a = embeddings[0:batch_size] +embeddings_b = embeddings[batch_size:2*batch_size] +embeddings_target = embeddings[2*batch_size:] + +print("embeddings_a shape:", embeddings_a.shape) +print("embeddings_b shape:", embeddings_b.shape) + +embeddings_a_b = t.cat([embeddings_a, embeddings_b], dim=1) +print("embeddings_a_b shape:", embeddings_a_b.shape) + +linear = nn.Linear(2*1024, 1024) +embeddings_sim = linear(embeddings_a_b) # shape: (batch_size, 1024) +# norm2 = nn.RMSNorm(1024) +# embeddings_sim = norm2(embeddings_sim) +print("embeddings_sim shape:", embeddings_sim.shape) + +embeddings_pos = embeddings_target + +texts = 
vec2text_model.predict( + inputs=embeddings_sim, + target_lang="eng_Latn", + max_seq_len=10, +) +print("texts:", texts) + +if input_target is not None: + sim_fct = nn.CosineSimilarity(dim=-1) + temperature = 0.05 + cos_sim = sim_fct( + embeddings_sim.unsqueeze(dim=1), # shape: (batch_size, 1, n_embd) + embeddings_pos.unsqueeze(dim=0), # shape: (1, batch_size, n_embd) + ) / temperature + # cos_sim = t.tensor([ + # [1.0, -1.0], + # [-1.0, 1.0], + # ]) + print(f"cos_sim:\n{cos_sim}") + cl_labels = t.arange(batch_size, device=embeddings_sim.device) + loss_fct = nn.CrossEntropyLoss() + cl_loss = loss_fct(cos_sim, cl_labels) + print("cl_loss:", cl_loss.item()) + +# %% + +input = [ + "hello world", + "world is a beautiful place", + "my name is jeff", + "name is a way to identify a person", + "is that true", + "jeff is the CEO of the company", + "and he is a good person", + "you can trust him", +] + +embeddings = text2vec_model.predict( + input=input, + source_lang="eng_Latn", +) + +sim_fct = nn.CosineSimilarity(dim=-1) +temperature = 0.05 +cos_sim = sim_fct( + embeddings.unsqueeze(dim=1), # shape: (batch_size, 1, n_embd) + embeddings.unsqueeze(dim=0), # shape: (1, batch_size, n_embd) +) / temperature +print(f"cos_sim:\n{cos_sim}") +loss_fct = nn.CrossEntropyLoss() +cl_labels = t.arange(len(input), device=embeddings.device) +cl_loss = loss_fct(cos_sim, cl_labels) +print("cl_loss:", cl_loss.item()) + +# %% + +preds = [ + "hello world", + "hello", +] +target = [ + "hello world", + "hello ?", +] + +print(f"preds: {preds}") +print(f"target: {target}") + +rouge = ROUGEScore(accumulate='avg') +rouge_score = rouge(preds, target) +rogue1_fmeasure = rouge_score["rouge1_fmeasure"] +rogue2_fmeasure = rouge_score["rouge2_fmeasure"] +rougeL_fmeasure = rouge_score["rougeL_fmeasure"] +rougeLsum_fmeasure = rouge_score["rougeLsum_fmeasure"] + +print(f"rouge1_fmeasure: {rogue1_fmeasure:.4f}") +# unigrams, rogue1_fmeasure=1.0 because the target unigrams are 3, +# and it matched 3 times in 
preds +print(f"rouge2_fmeasure: {rogue2_fmeasure:.4f}") +# bigrams +print(f"rougeL_fmeasure: {rougeL_fmeasure:.4f}") +# longest common subsequence +print(f"rougeLsum_fmeasure: {rougeLsum_fmeasure:.4f}") +# LCS over concatenated text + +# %% diff --git a/notes/naive_difference_operator.py b/notes/naive_difference_operator.py new file mode 100644 index 0000000..e4f3dfb --- /dev/null +++ b/notes/naive_difference_operator.py @@ -0,0 +1,428 @@ +# %% [markdown] +""" +1. `'Capital of' Vector = Embedding Relation - Embedding Control` does not work +1. `Inversion Vector = Embedding Passive - Embedding Active` does not work +2. `Relational Vector = Embedding Sentence - Embedding Cause - Embedding Effect` does not work +3. `Politeness Vector = Embedding Polite - Embedding Impolite` works for seen sentences, but not for unseen sentences +""" + +# %% + +import os +import sys + +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +if project_root not in sys.path: + print(f"Adding project root to sys.path: {project_root}") + sys.path.append(project_root) + +# %% + +from argparse import ArgumentParser +from typing import TypedDict + +import torch as t +from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline +from sonar.inference_pipelines.text import EmbeddingToTextModelPipeline + +from sklearn.decomposition import PCA +import plotly.graph_objects as go + +# %% + +class Args(TypedDict): + workspace_path: str + device: str + +def parse_args(): + parser = ArgumentParser(description="Run interpretable relational experiments.") + + parser.add_argument( + "--workspace_path", + type=str, + help="Path to the workspace directory where the experiments will be run.", + ) + + parser.add_argument( + "--device", + type=str, + help="Device to run the model on (e.g., 'cuda:0' for GPU, 'cpu' for CPU).", + ) + + ns = parser.parse_args() + return Args(**vars(ns)) + +if False: + WORKSPACE_PATH = "/workspace/ALGOVERSE/UJR/jason" + sys.argv = [ + 
'main.py', + '--workspace_path', WORKSPACE_PATH, + '--device', 'cuda:1', + ] + +args = parse_args() + +print("Parsed arguments:") +for key, value in args.items(): + print(f"{key}: {value}") + +# %% + +print("Loading text-to-embedding and embedding-to-text models...") +text2vec_model = TextToEmbeddingModelPipeline( + encoder="text_sonar_basic_encoder", + tokenizer="text_sonar_basic_encoder", + device=t.device(args["device"]), +) + +vec2text_model = EmbeddingToTextModelPipeline( + decoder="text_sonar_basic_decoder", + tokenizer="text_sonar_basic_decoder", + device=t.device(args["device"]), +) + +# %% [markdown] +""" +$$\vec{v}_{\text{capital\_of}} = S_{\text{relation}} - S_{\text{control}}$$ + +Conclusion: It doesn't work +""" + +sentences = [ + ("Paris is the capital of France.", # S_relation + "Paris and France are both in Europe.", # S_control + "Berlin is a large city in Germany."), # S_target + ("Berlin is the capital of Germany.", + "Berlin and Germany are both in Europe.", + "Paris is a large city in France."), + ("Madrid is the capital of Spain.", + "Madrid and Spain are both in Europe.", + "Lisbon is a large city in Portugal."), + ("Rome is the capital of Italy.", + "Rome and Italy are both in Europe.", + "Athens is a large city in Greece."), + ("London is the capital of the United Kingdom.", + "London and the United Kingdom are both in Europe.", + "Dublin is a large city in Ireland."), +] + +embeddings = text2vec_model.predict( + [sentence for tup in sentences for sentence in tup], + source_lang="eng_Latn", +) + +v_capital_of = t.stack([ + embeddings[i * 3 + 0] - embeddings[i * 3 + 1] + for i in range(len(sentences)) +]).mean(dim=0) + +intervened_embeddings = t.stack([ + embeddings[i * 3 + 2] + v_capital_of + for i in range(len(sentences)) +]) + +decoded_v_capital_of = vec2text_model.predict( + [v_capital_of], + target_lang="eng_Latn", +) + +decoded = vec2text_model.predict( + intervened_embeddings, + target_lang="eng_Latn", +) + 
# Report the decoded "capital-of" vector and each intervened target sentence
# (set up in the preceding cell) to eyeball whether the arithmetic worked.
print(f"S_decoded_v_capital_of: {decoded_v_capital_of[0]}")

for i, tup in enumerate(sentences):
    print(f"S_relation_{i}: {tup[0]}")
    print(f"S_control_{i}: {tup[1]}")
    print(f"S_target_{i}: {tup[2]}")
    print(f"S_decoded_{i}: {decoded[i]}")
    print()

# %% [markdown]
"""
$$\vec{v}_\text{inversion} = S_{\text{passive}} - S_{\text{active}}$$

Conclusion: It does not work
"""

# (active, passive) sentence pairs used to estimate the inversion direction.
sentences = [
    ("The heavy rain caused flooding",
     "The flooding was caused by the heavy rain"),
    ("The dog chased the cat.",
     "The cat was chased by the dog."),
    ("The teacher praised the student.",
     "The student was praised by the teacher."),
    ("The chef cooked a delicious meal.",
     "A delicious meal was cooked by the chef."),
    ("The artist painted a beautiful picture.",
     "A beautiful picture was painted by the artist."),
    ("The scientist discovered a new species.",
     "A new species was discovered by the scientist."),
]

# Embed all sentences in one batch; pairs are laid out flat as
# [active_0, passive_0, active_1, passive_1, ...].
embeddings = text2vec_model.predict(
    [sentence for tup in sentences for sentence in tup],
    source_lang="eng_Latn",
)

# Mean difference (passive - active) over all pairs.
v_inversion = t.stack([
    embeddings[i * 2 + 1] - embeddings[i * 2 + 0]
    for i in range(len(sentences))
]).mean(dim=0)

# Apply the inversion vector to every active-sentence embedding.
intervened_embeddings = t.stack([
    embeddings[i * 2 + 0] + v_inversion
    for i in range(len(sentences))
])

decoded_v_inversion = vec2text_model.predict(
    [v_inversion],
    target_lang="eng_Latn",
)

decoded = vec2text_model.predict(
    intervened_embeddings,
    target_lang="eng_Latn",
)

print(f"S_decoded_v_inversion: {decoded_v_inversion[0]}")
for i, tup in enumerate(sentences):
    print(f"S_active_{i}: {tup[0]}")
    print(f"S_passive_{i}: {tup[1]}")
    print(f"S_decoded_{i}: {decoded[i]}")
    print()

# %% [markdown]
"""
$$\vec{v}_{\text{relational}_i} = z_{\text{sentence}_i} - z_{\text{cause}_i} - z_{\text{effect}_i}$$

$$\text{alignment\_score} = \text{cosine\_similarity}(\vec{v}_\text{claimed\_relation},\vec{v}_\text{relational})$$

Conclusion: It does not work
"""

# Each tuple is (full sentence, cause with non-cause words masked as "_",
# effect with non-effect words masked as "_").
sentences = [
    ("Heavy rain caused flooding",  # S_sentence
     "Heavy rain _ _",  # S_cause
     "_ _ _ flooding"),  # S_effect
    ("Heavy rain resulted in flooding",
     "Heavy rain _ _ _",
     "_ _ _ flooding"),
    ("Heavy rain led to flooding",
     "Heavy rain _ _ _",
     "_ _ _ _ flooding"),
    ("Smoking increases the risk of cancer",
     "Smoking _ _ _ _ _",
     "_ the risk of cancer"),
    ("Smoking is a major cause of cancer",
     "Smoking _ _ _ _ _ _",
     "_ _ _ _ _ _ cancer"),
    ("Smoking contributes to the development of cancer",
     "Smoking _ _ _ _ _ _",
     "_ _ _ the development of cancer"),
]

# sentences = [
#     tuple(s.replace("_", "").strip() for s in tup)
#     for tup in sentences
# ]

# Claims to score; the 4th field is the human label used only for plotting.
claimed_sentences = [
    ("Vaccines caused a significant reduction in disease incidence",  # S_sentence
     "Vaccines _ _ _ _ _ _ _",  # S_cause
     "_ _ a significant reduction in disease incidence",  # S_effect
     "OK"),
    ("Vaccines cause autism",
     "Vaccines _ _",
     "_ _ autism",
     "Misleading claim"),
    ("Vaccines lead to autism",
     "Vaccines _ _ _",
     "_ _ _ autism",
     "Misleading claim"),
    ("Vaccines save lives",
     "Vaccines _ _",
     "_ _ lives",
     "OK"),
    ("Vaccines prevent disease",
     "Vaccines _ _",
     "_ _ disease",
     "OK"),
    ("Vaccines protect against infections",
     "Vaccines _ _ _",
     "_ _ against infections",
     "OK"),
    ("Vacciness cause autism. This is a fact",
     "Vaccines _ _. _ _ _ _",
     "_ _ autism. _ _ _ _ _",
     "Misleading claim"),
    ("Vaccines are safe and effective",
     "Vaccines _ _ _ _",
     "_ _ _ _ effective",
     "OK"),
    ("Global warming is caused by solar activity",
     "Global warming _ _ _ _ _",
     "_ _ _ _ _ solar activity",
     "Misleading claim"),
]

# claimed_sentences = [
#     tuple(s.replace("_", "").strip() for s in tup)
#     for tup in claimed_sentences
# ]

# Embed reference triples; laid out flat as
# [sentence_0, cause_0, effect_0, sentence_1, ...].
embeddings = text2vec_model.predict(
    [sentence for tup in sentences for sentence in tup],
    source_lang="eng_Latn",
)

claimed_embeddings = text2vec_model.predict(
    [sentence for tup in claimed_sentences for sentence in tup],
    source_lang="eng_Latn",
)

# Relational residue: sentence embedding minus cause and effect embeddings.
v_relational = t.stack([
    embeddings[i * 3 + 0] - embeddings[i * 3 + 1] - embeddings[i * 3 + 2]
    for i in range(len(sentences))
])

v_claimed_relational = t.stack([
    claimed_embeddings[i * 3 + 0] - claimed_embeddings[i * 3 + 1] - claimed_embeddings[i * 3 + 2]
    for i in range(len(claimed_sentences))
])

# Cosine similarity of each claim's residue against the mean reference residue.
alignment_score = t.cosine_similarity(
    v_relational.mean(dim=0),
    v_claimed_relational,
)

for i, tup in enumerate(claimed_sentences):
    print(f"S_claimed_sentence_{i}: {tup[0]}")
    print(f"alignment_score_{i}: {alignment_score[i].item()}")
    print(f"norm_{i}: {v_claimed_relational[i].norm().item():.4f}")

# 2-D projection for visualization; PCA is fit on the reference residues only
# and the claimed residues are projected into the same space.
pca = PCA(n_components=2)
v_relational_2d = pca.fit_transform(
    v_relational.cpu().numpy(),
)

print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.4f}")

v_claimed_relational_2d = pca.transform(
    v_claimed_relational.cpu().numpy(),
)

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=v_relational_2d[:, 0],
    y=v_relational_2d[:, 1],
    mode='markers',
    name="Relational",
    marker=dict(color="blue", opacity=0.5),
    text=[sentences[i][0] for i in range(len(v_relational_2d))],
))
# Boolean masks splitting claims by their human label for separate traces.
claimed_labels = [tup[3] for tup in claimed_sentences]
ok_mask = [label == "OK" for label in claimed_labels]
misleading_mask = [label == "Misleading claim" for label in claimed_labels]
+fig.add_trace(go.Scatter( + x=v_claimed_relational_2d[ok_mask, 0], + y=v_claimed_relational_2d[ok_mask, 1], + mode='markers', + name='Claimed OK', + marker=dict(color='green', opacity=0.7), + text=[claimed_sentences[i][0] for i, ok in enumerate(ok_mask) if ok] +)) +fig.add_trace(go.Scatter( + x=v_claimed_relational_2d[misleading_mask, 0], + y=v_claimed_relational_2d[misleading_mask, 1], + mode='markers', + name='Claimed Misleading', + marker=dict(color='red', opacity=0.7), + text=[claimed_sentences[i][0] for i, ms in enumerate(misleading_mask) if ms] +)) + +fig.update_layout( + title="PCA of Relational and Claimed Relational Vectors", + xaxis_title="PCA Component 1", + yaxis_title="PCA Component 2", + legend=dict(title="Legend"), +) + +# %% [markdown] +""" +$$\vec{v}_\text{politeness} = S_{\text{polite}} - S_{\text{impolite}}$$ + +Conclusion: It works for seen sentences, but not for unseen sentences +""" + +sentences = [ + ("Could you help me?", "Help me!"), + ("Can you assist me?", "Assist me!"), + ("Would you be able to help me?", "Help me!"), + ("I need your assistance.", "Need your assistance!"), + ("Please help me with this task.", "Help me with this task!"), +] + +unseen_sentences = [ + ("Look at this!", "Impolite"), + ("Pay attention to this!", "Impolite"), + ("Just do it!", "Impolite"), + ("I need you to do this!", "Impolite"), + ("Can you do this for me?", "Polite"), + ("I would appreciate your help with this.", "Polite"), +] + +embeddings = text2vec_model.predict( + [sentence for tup in sentences for sentence in tup], + source_lang="eng_Latn", +) + +unseen_embeddings = text2vec_model.predict( + [sentence for tup in unseen_sentences for sentence in tup], + source_lang="eng_Latn", +) + +v_politeness = t.stack([ + embeddings[i * 2 + 0] - embeddings[i * 2 + 1] + for i in range(len(sentences)) +]).mean(dim=0) + +intervened_embeddings = t.stack([ + embeddings[i * 2 + 1] + v_politeness + for i in range(len(sentences)) +]) + +intervened_unseen_embeddings = 
t.stack([ + unseen_embeddings[i * 2 + 1] + v_politeness + for i in range(len(unseen_sentences)) +]) + +decoded_v_politeness = vec2text_model.predict( + [v_politeness], + target_lang="eng_Latn", +) +decoded = vec2text_model.predict( + intervened_embeddings, + target_lang="eng_Latn", +) +decoded_unseen = vec2text_model.predict( + intervened_unseen_embeddings, + target_lang="eng_Latn", +) + +print(f"S_decoded_v_politeness: {decoded_v_politeness[0]}") +for i, tup in enumerate(sentences): + print(f"S_polite_{i}: {tup[0]}") + print(f"S_impolite_{i}: {tup[1]}") + print(f"S_decoded_{i}: {decoded[i]}") + print() + +for i, tup in enumerate(unseen_sentences): + print(f"S_unseen_{i}: {tup[0]}") + print(f"S_unseen_label_{i}: {tup[1]}") + print(f"S_decoded_unseen_{i}: {decoded_unseen[i]}") + print() + +# %% diff --git a/notes/pca.py b/notes/pca.py new file mode 100644 index 0000000..ae840a6 --- /dev/null +++ b/notes/pca.py @@ -0,0 +1,228 @@ +# %% [markdown] + +""" +Personal note: +1. What is PCA? + Principal Component Analysis (PCA) is a linear + dimensionality reduction technique. It transforms + a dataset with possibly correlated features into + a set of linearly uncorrelated variables called + principal components. The first principal + component captures the largest possible variance, + the second the next largest (orthogonal to the + first), and so on. PCA is widely used for + visualization, noise reduction and feature + extraction +""" + +# %% +# Example: PCA from 3D to 2D (Manual and +# scikit-learn) + +# Manual PCA (without scikit-learn) +import numpy as np + +# %% + +# Example 3D data (each row is a sample) +X = np.array([ + [2.5, 2.4, 0.5], + [0.5, 0.7, 0.2], + [2.2, 2.9, 0.7], + [1.9, 2.2, 0.3], + [3.1, 3.0, 0.9], + [2.3, 2.7, 0.6], + [2.0, 1.6, 0.4], + [1.0, 1.1, 0.1], + [1.5, 1.6, 0.2], + [1.1, 0.9, 0.1] +]) + +# %% +""" +Personal note: +1. Why do we center the data? 
# %%
"""
Personal note:
1. Why do we center the data?
   Centering (subtracting each column's mean) gives every feature zero
   mean. PCA measures variance about the origin, so without centering the
   first principal component can simply point towards the data mean rather
   than the true direction of maximum variance.
"""

# 1. Center the data
X_mean = X.mean(axis=0)        # per-feature (column) means
X_meaned = X - X_mean

print(f"X_mean:\n{X_mean}")
print(f"X_meaned:\n{X_meaned}")

# %%
"""
Personal note:
2. How is the covariance matrix computed? (example without `np.cov`)
   The covariance matrix measures how much each pair of features varies
   together. For an (n_samples x n_features) matrix X:

   Cov(i, j) = (1 / (n - 1)) * \sum_{k=1}^n (X_{k,i} - \bar{X}_i) * (X_{k,j} - \bar{X}_j)
"""

# Covariance matrix by hand: centered data, divided by (n - 1).
n_samples = X_meaned.shape[0]
cov_manual = (X_meaned.T @ X_meaned) / (n_samples - 1)

# 2. Compute covariance matrix
cov_mat = np.cov(X_meaned, rowvar=False)

print(f"Manual covariance matrix:\n{cov_manual}")
print(f"Covariance matrix:\n{cov_mat}")

# %%

"""
Personal note:
1. What are eigenvalues and eigenvectors?
   An eigenvector v of a square matrix A is a nonzero vector that A only
   stretches or shrinks without changing its direction: A v = \lambda v,
   where the scalar \lambda is the corresponding eigenvalue. In PCA the
   eigenvectors of the covariance matrix point along the directions of
   maximum variance (the principal components) and the eigenvalues say how
   much variance lies along each.
2. How are they computed? (example without `np.linalg.eigh`)
   Eigenvalues solve the characteristic equation det(A - \lambda I) = 0;
   for each eigenvalue the eigenvector solves (A - \lambda I) v = 0.
"""

# 3. Compute eigenvalues and eigenvectors (eigh: cov_mat is symmetric)
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

print(f"eigenvalues:\n{eig_vals}")
print(f"eigenvectors:\n{eig_vecs}")

# %%

# 4. Sort eigenpairs by eigenvalue, largest first.
sorted_idx = np.argsort(eig_vals)[::-1]
eig_vals = eig_vals[sorted_idx]
eig_vecs = eig_vecs[:, sorted_idx]

print("Sorted eigenvalues:\n", eig_vals)
print("Sorted eigenvectors:\n", eig_vecs)

# %%

# 5. Keep the two directions carrying the most variance.
eig_vecs_2d = eig_vecs[:, :2]

"""
Personal note:
1. Why select the top 2 eigenvectors by eigenvalue?
   Each eigenvector is a direction in feature space; its eigenvalue is the
   variance of the data along that direction. Projecting onto the
   largest-eigenvalue directions retains as much variance (information) as
   possible in fewer dimensions -- the core idea of PCA.
"""

# %%

# 6. Project data onto new 2D space
X_pca_manual = X_meaned @ eig_vecs_2d
print("Manual PCA result (first 2D points):\n", X_pca_manual[:3])

# %%
# scikit-learn PCA (for validation)

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca_sklearn = pca.fit_transform(X)

print("sklearn PCA result (first 2D points):\n", X_pca_sklearn[:3])

# %%

explained_variance_ratio_manual = eig_vals / np.sum(eig_vals)

print("Manual PCA explained variance ratio:", explained_variance_ratio_manual)
print("PCA explained variance ratio:", pca.explained_variance_ratio_)

"""
Personal note:
3. What is the PCA explained variance ratio? Is [0.1, 0.017] good?
   Each value is the fraction of total variance captured by that
   component: [0.1, 0.017] means the first two components together keep
   only ~11.7% of the variance. That is not good for visualization -- a 2D
   plot would miss most of the structure, and apparent clusters may not
   reflect real high-dimensional relationships. Rule of thumb: the first
   two components should explain a substantial share (e.g. >50%) before a
   2D PCA plot is trusted.
"""
"""
Summary:
- PCA finds the directions of maximum variance.
- You can do PCA manually with numpy (see above).
- The explained variance ratio tells you how much of the data's structure
  is captured; [0.1, 0.017] is low, so the plot may not be representative
  of the real data structure.
"""
# %%

# ==== next file in this dump: notes/pca_steering.py ====

# %%

print("Importing libraries...")
import numpy as np
import plotly.graph_objs as go
from sklearn.decomposition import PCA

# %%

print("Preparing data for PCA...")
# Corners of a small cube; cluster1 shifts it row-by-row, cluster2 scales it.
original = np.array([
    [1, 1, 1],
    [1, 2, 1],
    [2, 1, 1],
    [2, 2, 1],
    [1, 1, 2],
    [1, 2, 2],
    [2, 1, 2],
    [2, 2, 2],
])
cluster1 = original * 1 - [[1, 8, 1],
                           [1, 7, 2],
                           [1, 6, 3],
                           [1, 5, 4],
                           [1, 4, 5],
                           [1, 3, 6],
                           [1, 2, 7],
                           [1, 1, 8]]
cluster2 = original * 3
print(f"cluster1:\n{cluster1}")
print(f"cluster2:\n{cluster2}")
X = np.vstack([cluster1, cluster2])

# %%


def _markers3d(pts, name, color, size=6, symbol=None, text=None):
    """One 3D marker trace for an (n, 3) array of points."""
    marker = dict(size=size, color=color)
    if symbol is not None:
        marker['symbol'] = symbol
    return go.Scatter3d(
        x=pts[:, 0], y=pts[:, 1], z=pts[:, 2],
        mode='markers', marker=marker, name=name, text=text,
    )


def _dashed3d(a, b, name, color, width, marker_size=2):
    """Dashed 3D segment from point a to point b."""
    return go.Scatter3d(
        x=[a[0], b[0]], y=[a[1], b[1]], z=[a[2], b[2]],
        mode='lines+markers',
        marker=dict(size=marker_size, color=color),
        line=dict(color=color, width=width, dash='dash'),
        name=name,
    )


print("Plotting original 3D data...")
fig = go.Figure()
fig.add_trace(_markers3d(cluster1, 'cluster1', 'blue'))
fig.add_trace(_markers3d(cluster2, 'cluster2', 'red'))
fig.update_layout(
    scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z'),
    margin=dict(l=0, r=0, b=0, t=0),
)
fig.show()

# %%

print("Performing PCA to reduce from 3D to 2D...")
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X)

# %%

print("Plotting PCA result in 2D...")
fig = go.Figure()
fig.add_trace(go.Scatter(x=X_2d[:8, 0], y=X_2d[:8, 1], mode='markers',
                         marker=dict(size=6, color='blue'), name='cluster1'))
fig.add_trace(go.Scatter(x=X_2d[8:, 0], y=X_2d[8:, 1], mode='markers',
                         marker=dict(size=6, color='red'), name='cluster2'))
fig.update_layout(xaxis_title='PC1', yaxis_title='PC2',
                  margin=dict(l=0, r=0, b=0, t=0))
fig.show()

# %%

point_idx = 1
scaling = 1.0

print("Shifting a point towards the direction of cluster2...")

original_point = X[point_idx]

cluster1_mean = cluster1.mean(axis=0)
cluster2_mean = cluster2.mean(axis=0)
direction = cluster2_mean - cluster1_mean
direction_unit = direction / np.linalg.norm(direction)

# Project (point -> cluster2 mean) onto the inter-cluster axis and move the
# point exactly that far along the axis.
point_to_cluster2_mean = cluster2_mean - original_point
distance_to_cluster2 = np.dot(direction_unit, point_to_cluster2_mean)
adaptive_shift = distance_to_cluster2 * direction_unit

# equivalent formulation:
# proj = np.dot(direction_unit, point_to_cluster2_mean)
# adaptive_shift = proj / np.linalg.norm(direction) * direction

shifted_point = original_point + scaling * adaptive_shift

original_2d = pca.transform([original_point])[0]
shifted_2d = pca.transform([shifted_point])[0]

print("Plot original and shifted point with an arrow")
fig = go.Figure()

# Cluster points, labelled by row index so they can be identified on hover.
fig.add_trace(_markers3d(cluster1, 'cluster1', 'blue',
                         text=[f"id: {i}" for i in range(len(cluster1))]))
fig.add_trace(_markers3d(cluster2, 'cluster2', 'red',
                         text=[f"id: {i+8}" for i in range(len(cluster2))]))

# Cluster means as open circles.
fig.add_trace(_markers3d(np.array([cluster1_mean]), 'cluster1 mean', 'blue',
                         symbol='circle-open'))
fig.add_trace(_markers3d(np.array([cluster2_mean]), 'cluster2 mean', 'red',
                         symbol='circle-open'))

# Inter-cluster axis, extended one unit beyond each mean.
fig.add_trace(_dashed3d(cluster1_mean - direction_unit,
                        cluster2_mean + direction_unit,
                        'direction unit axis', 'black', 4))

# Vector from the chosen point to the cluster2 mean.
fig.add_trace(_dashed3d(original_point, cluster2_mean,
                        'point_to_cluster2_mean', 'purple', 6))

# The steered point.
fig.add_trace(_markers3d(np.array([shifted_point]), 'shifted_point', 'orange',
                         symbol='diamond'))

# Arrow from the original point to the shifted point.
fig.add_trace(_dashed3d(original_point, shifted_point,
                        'shift_vector', 'black', 6))

# Dividing plane through cluster2_mean with normal direction_unit.
# Reference: https://www.youtube.com/watch?v=2sZKZHyaQJ8
# Plane through (x0, y0, z0) with normal (a, b, c):
#   a(x-x0) + b(y-y0) + c(z-z0) = 0  =>  z = z0 - (a(x-x0) + b(y-y0)) / c
# NOTE(review): this divides by c and would fail if direction_unit[2] == 0;
# fine for this fixed dataset, but worth guarding if the data changes.
xx, yy = np.meshgrid(
    np.linspace(cluster2_mean[0] - 1, cluster2_mean[0] + 1, 10),
    np.linspace(cluster2_mean[1] - 1, cluster2_mean[1] + 1, 10),
)
a, b, c = direction_unit
x0, y0, z0 = cluster2_mean
zz = z0 - (a * (xx - x0) + b * (yy - y0)) / c

fig.add_trace(go.Surface(
    x=xx, y=yy, z=zz,
    colorscale='Greys',
    opacity=0.5,
    showscale=False,
    name='Dividing Plane',
))

fig.update_layout(
    scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z',
               aspectmode='data'),
    margin=dict(l=0, r=0, b=0, t=0),
)
fig.show()

# %%

print("Plot PCA result in 2D with shifted point...")


def _markers2d(xy, name, color, size=6, symbol=None):
    """One 2D marker trace for an (n, 2) array of points."""
    marker = dict(size=size, color=color)
    if symbol is not None:
        marker['symbol'] = symbol
    return go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='markers',
                      marker=marker, name=name)


def _dashed2d(a, b, name, color, width=2, marker_size=2):
    """Dashed 2D segment from point a to point b."""
    return go.Scatter(
        x=[a[0], b[0]], y=[a[1], b[1]],
        mode='lines+markers',
        marker=dict(size=marker_size, color=color),
        line=dict(color=color, width=width, dash='dash'),
        name=name,
    )


fig = go.Figure()

# Clusters in PC space.
fig.add_trace(_markers2d(X_2d[:8], 'cluster1', 'blue'))
fig.add_trace(_markers2d(X_2d[8:], 'cluster2', 'red'))

# Cluster means projected into PC space.
cluster1_mean_2d = pca.transform([cluster1_mean])[0]
cluster2_mean_2d = pca.transform([cluster2_mean])[0]
fig.add_trace(_markers2d(np.array([cluster1_mean_2d]), 'cluster1 mean',
                         'blue', symbol='circle-open'))
fig.add_trace(_markers2d(np.array([cluster2_mean_2d]), 'cluster2 mean',
                         'red', symbol='circle-open'))

# Project the 3D direction into PC space and re-normalise it.
direction_unit_2d = direction_unit @ pca.components_.T
direction_unit_2d = direction_unit_2d / np.linalg.norm(direction_unit_2d)
fig.add_trace(_dashed2d(cluster2_mean_2d - direction_unit_2d,
                        cluster2_mean_2d + direction_unit_2d,
                        'direction unit axis', 'black'))

# point_to_cluster2_mean vector in PC space.
fig.add_trace(_dashed2d(original_2d, cluster2_mean_2d,
                        'point_to_cluster2_mean', 'purple'))

# The steered point.
fig.add_trace(_markers2d(np.array([shifted_2d]), 'shifted_point', 'orange',
                         symbol='diamond'))

# Arrow from the original point to the shifted point.
fig.add_trace(_dashed2d(original_2d, shifted_2d, 'shift_vector', 'black'))

fig.update_layout(xaxis_title='PC1', yaxis_title='PC2',
                  margin=dict(l=0, r=0, b=0, t=0))
fig.show()

# %%

# ==== next file in this dump: notes/umap.py ====

# %%
"""
Personal note:
1. What is UMAP?
   UMAP (Uniform Manifold Approximation and Projection) is a nonlinear
   dimensionality-reduction technique. It builds a weighted graph
   representing the data's manifold structure, then optimises a
   low-dimensional embedding that preserves local structure (and some
   global structure). It is widely used for visualization and clustering,
   and is generally faster and more scalable than t-SNE.
"""
+ + Step 1: "Manual" Neighbor-pPreserving Projection + (for illustration) + + Suppose you have 3D points in a cluster and + want to project to 2D, preserving nearest neighbors: +""" + +import numpy as np +import matplotlib.pyplot as plt +import umap + +# Generate synthetic 3D data (e.g., a noisy spiral) +np.random.seed(42) +t = np.linspace(0, 4 * np.pi, 200) +x = np.sin(t) +y = np.cos(t) +z = t + 0.1 * np.random.randn(200) +data_3d = np.vstack([x, y, z]).T + +# "Manual" projection: project onto the plane defined by the first two principal axes (like PCA) +from sklearn.decomposition import PCA +pca = PCA(n_components=2) +manual_2d = pca.fit_transform(data_3d) + +print(f"PCA Explained Variance Ratio: {pca.explained_variance_ratio_}") + +# UMAP projection +umap_2d = umap.UMAP(n_components=2, random_state=42).fit_transform(data_3d) + +# Plot both +fig, axs = plt.subplots(1, 2, figsize=(10, 4)) +axs[0].scatter(manual_2d[:, 0], manual_2d[:, 1], c=t, cmap='viridis') +axs[0].set_title('Manual (PCA-like) 2D Projection') +axs[1].scatter(umap_2d[:, 0], umap_2d[:, 1], c=t, cmap='viridis') +axs[1].set_title('UMAP 2D Projection') +plt.show() + +""" + - The manual projection (using PCA) simply projects + onto the axes of maximum variance. + - UMAP will try to preserve local neighbor relationships, + possibly revealing more structure + +3. Does UMAP have an "explained variance ratio" like + PCA? + + No, UMAP does not provide an explained variance ratio. + - PCA is a linear method and each principal component + explains a portion of the total variance, which + can be quantified + - UMAP is nonlinear and focuses on preserving local + structure (neighborhoods), not maximizing variance. + There is no direct analog to explained variance ratio + in UMAP. + - If you need to quantify how much information + is preserved, you can use trustworthiness, + continuity, or reconstruction error, + but these are not provided by default in UMAP. 
+ +Summary: +- UMAP is for nonlinear neighbor-preserving embedding, + not variance maximization. +- No explained variance ratio is available for UMAP. +""" + +""" +Personal note: +1. What do you mean by "It aims to preserve both local and some global structure of high-dimensional data when projecting it into a lower-dimensional space"? What do you mean by local and global structure? What is local and global structure? + + Local structure refers to the relationships between + each data point and its nearest neighbors–how close + or similar points are to each other in small + neighborhoods. + - Example: If two points are very close in the + original high-dimensional space, a method + that preserves local structure will keep them + close in the low-dimensional embedding + + Global structure refers to the overall arangement + and relationships between distant groups or + clusters in the data–the "big picture" of how all + points and clusters relate to each other + - Example: if there are three well-separated + clusters in high-dimensional space, a method that + preserves global structure will keep those clusters + separated and in roughly the same arrangement + in the low-dimensional embedding + + In summary: + - Local structure: Small-scale relationships (nearest + neighbors, small clusters). + - Global structure: Large-scale relationships + (cluster separation, overall data geometry). + + UMAP tries to preerve botH: + - It keeps neighbors together (local structure) + - It also tries to maintain the relative + positions of clusters or groups (some global + structure), though not as strictly as local structure +""" + +""" +Personal note: +1. Does PCA aims to preserve both local and some global structure of high-dimensional data when projecting it into a lower-dimensional space too? + PCA primarily aims to preserve global structure–it + projects data onto new axes (principal components) + that capture the most variance in the entire dataset. 
+ This means PCA tries to maintain the overall shape, + spread, and large-scale relationships of the data. + + However, PCA does not explicitly preserve local + structure (the relationships between nearest + neighbors). Sometimes, local structure is preserved + as a side effect if it aligns with the directions + of high variance, but PCA does not optimize for local + neighborhoods + + Summary: + - PCA: Preserves global structure (variance, overall + geometry), but not specifically local structure. + - UMAP: Explicitly preserves local structure + (neighbor relationships) and tries to maintain + some global structure +""" + +# %% \ No newline at end of file diff --git a/src/plot_pca.py b/src/plot_pca.py index 38c7730..b1c6b35 100644 --- a/src/plot_pca.py +++ b/src/plot_pca.py @@ -1,4 +1,5 @@ # %% + import os import json import torch @@ -10,7 +11,7 @@ # Assuming these utils are in the same directory or accessible from utils_load_data import load_embeds, load_split_paragraphs -from utils_sonar import load_tokenizer # Or use the alternative below +# from utils_sonar import load_tokenizer # Or use the alternative below # --- Alternative Tokenizer Loading --- # try: @@ -32,6 +33,8 @@ import phate # Import PHATE import plotly.express as px # For interactive plots +# %% + # --- Configuration --- unit = "Characters" # Or "Tokens" MAX_FILES_TO_LOAD = 100 # Adjust as needed for memory/time @@ -42,6 +45,8 @@ INTERACTIVE_SUBSAMPLE_SIZE = 2000 RANDOM_STATE = 42 # For reproducibility +# %% + # --- Load Tokenizer --- # tokenizer = load_tokenizer() # Uncomment if using the alternative loading @@ -106,6 +111,7 @@ def load_data_for_analysis(max_files=MAX_FILES_TO_LOAD): # Return numpy array for vectors, lists/numpy arrays for metadata return vec_array, all_texts, np.array(all_lengths), np.array(all_labels) +# %% # --- Plotting Functions --- def plot_dimensionality_reduction_static(results, labels, lengths, title): @@ -240,7 +246,8 @@ def run_tsne_analysis(data, texts, labels, 
lengths): print("\nRunning t-SNE...") tsne_model = TSNE( n_components=2, perplexity=30, learning_rate='auto', - n_iter=300, init='pca', random_state=RANDOM_STATE, + n_iter_without_progress=300, + init='pca', random_state=RANDOM_STATE, n_jobs=-1, verbose=1 ) # Fit t-SNE on the potentially pre-subsampled data passed to this function @@ -297,6 +304,59 @@ def run_phate_analysis(data, texts, labels, lengths): ) return phate_results +# %% + +""" +Personal note: +1. You are an expert in Principal Component + Analysis (PCA), Uniform Manifold Approximation + and Projection (UMAP), t-distributed stochastic + neighbor embedding (t-SNE), + Potential of Heat-diffusion for Affinity-based + Trajectory Embedding (PHATE). What is the goal + of PCA? What is the goal of UMAP? What is the + goal of t-SNE? What is the goal of PHATE? + + PCA (Principal Component Analysis): + The goal of PCA is to reduce dimensionality of + data by finding new orthogonal axes + (principal components) that capture the maximum + variance in the data. It projects the data onto + these axes, allowing you to represent high- + dimensional data in fewer dimensions + (often 2 or 3) while preserving as much + as information (variance) as possible + + UMAP (Uniform Manifold Approximation and Projection): + UMAP aims to reduce dimensionality while preserving + both local and some global structure of the data. + It is based on manifold learning and constructs + a high-dimensional graph, + then optimizes a low-dimensional graph to be + as structurally similar as possible. + UMAP is often used for visualization and clustering, + and is faster and more scalable than t-SNE + + t-SNE (t-distributed Stochastic Neighbor Embedding): + The goal of t-SNE is to visualize high-dimensional + data by reducing it to two or three dimensions, + focusing on preserving local similarities (i.e., + points that are close in high-dimensional space + remain close in the low-dimensional embedding). 
+ It is particularly good for visualizing clusters, + but does not preserve global structure well. + + PHATE (Potential of Heat-difussion for Affinity- + based Trajectory Embedding): + PHATE is designed to capture both local and + global nonlinear structure in high-dimensional + data, especially for data with continuous trajectories + (e.g., biological processes). It uses diffusion + geometry to model transitions and relationships, + producing embeddings that reveal both clusters + and progression/trajectory patterns +""" + # --- Main Execution --- if __name__ == "__main__": # 1. Load Data (including texts now) @@ -334,10 +394,6 @@ def run_phate_analysis(data, texts, labels, lengths): print("\nAnalysis complete.") -# %% - -print("hello") - # %% def plot_dimensionality_reduction_interactive_length(results, texts, labels, lengths, title): """Plots interactive results using Plotly, coloring by length."""