"""Task definitions shared by the data-preprocessing pipeline (dp_utils/task.py)."""
from enum import Enum


class Task(Enum):
    """The embedding-arithmetic operations the model is trained to perform."""

    # Combine two simple sentences into one complex sentence.
    ADD = "ADD"
    # Remove one simple sentence from a complex sentence.
    DIFFERENCE = "DIFFERENCE"
# %% [markdown]
"""
Problem 1:
Computing the ROUGE score requires both predicted and target texts.
Generating the predicted text involves decoding embeddings which, after
passing through an untrained linear transformation, can make the decoder
produce excessively long outputs (e.g. "It is an integral part of the
conversation." instead of the expected "hello").  As a result, decoding
16 embeddings takes roughly 10 seconds on average.
"""

# %%

# !pip install "datasets>=3,<4"

# %%

import os
import sys

# Make the repository root importable so `dp_utils` resolves when this file
# runs as a script or notebook.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
if project_root not in sys.path:
    print(f"Adding project root to sys.path: {project_root}")
    sys.path.append(project_root)

# %%

from dp_utils import Task

import random
from argparse import ArgumentParser
from dataclasses import dataclass
from typing import TypedDict

import numpy as np
import torch as t
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch import distributed as dist
from torch.utils.data import (
    DataLoader,
    DistributedSampler,
    IterableDataset,
)

from datasets import load_dataset, load_from_disk
from jaxtyping import Float

from pytorch_lightning import (
    LightningDataModule,
    LightningModule,
    Trainer,
)
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.strategies import DDPStrategy
from pytorch_lightning.utilities import rank_zero_info

from transformers.modeling_outputs import ModelOutput
from transformers.optimization import get_linear_schedule_with_warmup

from sonar.inference_pipelines.text import (
    EmbeddingToTextModelPipeline,
    TextToEmbeddingModelPipeline,
)

from fairseq2 import setup_fairseq2
from fairseq2.data import Collater
from fairseq2.data.text.tokenizers import get_text_tokenizer_hub
from fairseq2.models.sequence import SequenceBatch
from fairseq2.nn.padding import get_seqs_and_padding_mask

# from torchmetrics.text.rouge import ROUGEScore

# %%

print("Setting float32 matmul precision to 'high'...")  # For better performance
t.set_float32_matmul_precision('high')

# %%

print("Setting up fairseq2...")
setup_fairseq2()

# %%

class Args(TypedDict):
    """Typed view of the parsed command-line arguments."""

    train: bool
    checkpoint_path: str | None
    data_dir: str
    default_root_dir: str
    accelerator: str
    strategy: str
    devices: list[int]
    batch_size: int
    lr: float
    num_warmup_steps: int
    # max_steps = num_samples // (devices * batch_size) * max_epochs,
    # e.g. (989944 // (3 * 8) + 989944 // (3 * 8)) * 10 = 824940 steps.
    max_steps: int


def parse_args() -> Args:
    """Parse command-line flags into an :class:`Args` mapping."""
    parser = ArgumentParser(description="Data Preprocessing Script")
    parser.add_argument(
        "--train",
        action="store_true",
        help="Run in training mode.",
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="Path to the workspace directory where data will be processed.",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default=None,
        help="Path to the model checkpoint for loading.",
    )
    parser.add_argument(
        "--default_root_dir",
        type=str,
        help="Default root directory for PyTorch Lightning logs.",
    )
    parser.add_argument(
        "--accelerator",
        type=str,
        help="Type of accelerator to use (e.g., 'gpu', 'cpu').",
    )
    parser.add_argument(
        "--strategy",
        type=str,
        help="Distributed training strategy (e.g., 'ddp', 'dp').",
    )
    parser.add_argument("--devices", nargs="+", type=int)
    parser.add_argument(
        "--batch_size",
        type=int,
        help="Batch size per GPU for training.",
    )
    parser.add_argument(
        "--lr",
        type=float,
        help="Learning rate for the optimizer.",
    )
    parser.add_argument(
        "--num_warmup_steps",
        type=int,
        help="Number of warmup steps for the learning rate scheduler.",
    )
    parser.add_argument(
        "--max_steps",
        type=int,
        help="Total number of training steps.",
    )
    return Args(**vars(parser.parse_args()))


def is_notebook() -> bool:
    """Return True when running under IPython/Jupyter."""
    try:
        __IPYTHON__  # type: ignore  # only defined by IPython
    except NameError:
        return False
    return True


if is_notebook():
    # In a notebook there is no real command line; fabricate one.
    WORKSPACE_DIR = "/workspace/ALGOVERSE/UJR/jason"
    DEFAULT_ROOT_DIR = f"{WORKSPACE_DIR}/experiments/data_preprocess"
    DATA_DIR = f"{WORKSPACE_DIR}/data"
    sys.argv = [
        'main.py',
        # '--train',
        '--data_dir', DATA_DIR,
        '--default_root_dir', DEFAULT_ROOT_DIR,
        '--accelerator', 'gpu',
        '--strategy', 'auto',
        '--devices', '2', '3',
        '--batch_size', '8',
        '--lr', '1e-4',
        '--num_warmup_steps', '10',
        '--max_steps', '20',
    ]

    CHECKPOINT_PATH = f"{DEFAULT_ROOT_DIR}/lightning_logs/version_1/checkpoints/epoch=0-step=123250-val_loss_epoch=0.7371.ckpt"
    sys.argv += ['--checkpoint_path', CHECKPOINT_PATH]

args = parse_args()

print("Parsed arguments:")
for key, value in args.items():
    print(f"{key}: {value}")

# %%

class DataCollator:
    """Collate tokenised samples into lists of 1-D token tensors.

    Padding is intentionally NOT applied here — the model pads all three
    sequence groups together with a fairseq2 ``Collater``; ``pad_idx`` is
    retained in case collate-time padding is reinstated.
    """

    def __init__(self, pad_idx: int):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        return {
            "input_a": [t.tensor(item["input_a"]) for item in batch],
            "input_b": [t.tensor(item["input_b"]) for item in batch],
            "input_target": [t.tensor(item["input_target"]) for item in batch],
        }
"input_b": input_b_padded, + # "input_target": input_target_padded, + } + +class TaskDataModule(LightningDataModule): + def __init__( + self, + data_dir: str, + task: Task, + batch_size: int, + ): + super().__init__() + + self.data_dir = data_dir + self.task = task + self.batch_size = batch_size + + tokenizer_hub = get_text_tokenizer_hub() + self.tokenizer = tokenizer_hub.load( + name_or_card="text_sonar_basic_encoder", + ) + self.collator = DataCollator(pad_idx=self.tokenizer.vocab_info.pad_idx) + + self.train_dataset = None + self.val_dataset = None + self.test_dataset = None + + def prepare_data(self): + # download, split, etc... + # only called on 1 GPU/TPU in distributed + self._prepare_data_wiki_split() + + def setup(self, stage): + self._setup_load_from_disk( + stage=stage, + path="google-research-datasets/wiki_split", + ) + + def train_dataloader(self): + """ + Reference: + [1] https://lightning.ai/docs/pytorch/stable/common/trainer.html#use-distributed-sampler + """ + sampler = ( + DistributedSampler(self.train_dataset, shuffle=True) + if dist.is_available() and dist.is_initialized() + else None + ) + dataloader = TaskDataLoader( + task=self.task, + sampler=sampler, + shuffle=sampler is None, + dataset=self.train_dataset, + batch_size=self.batch_size, + drop_last=True, + collate_fn=self.collator, + num_workers=os.cpu_count() // 2 if os.cpu_count() > 1 else 1, + ) + return dataloader + + def val_dataloader(self): + sampler = ( + DistributedSampler(self.val_dataset, shuffle=False) + if dist.is_available() and dist.is_initialized() + else None + ) + dataloader = TaskDataLoader( + task=self.task, + sampler=sampler, + shuffle=sampler is None, + dataset=self.val_dataset, + batch_size=self.batch_size, + drop_last=False, + collate_fn=self.collator, + num_workers=os.cpu_count() // 2 if os.cpu_count() > 1 else 1, + ) + return dataloader + + def test_dataloader(self): + sampler = ( + DistributedSampler(self.test_dataset, shuffle=False) + if dist.is_available() and 
dist.is_initialized() + else None + ) + dataloader = TaskDataLoader( + task=self.task, + sampler=sampler, + shuffle=sampler is None, + dataset=self.test_dataset, + batch_size=self.batch_size, + drop_last=False, + collate_fn=self.collator, + num_workers=os.cpu_count() // 2 if os.cpu_count() > 1 else 1, + ) + return dataloader + + def _prepare_data_wiki_split(self): + def preprocess(item): + complex_sentence = item['complex_sentence'] + simple_sentence_1 = item['simple_sentence_1'] + simple_sentence_2 = item['simple_sentence_2'] + + flip = random.random() < 0.5 + + match self.task: + case Task.ADD: + input_a = simple_sentence_1 + input_b = simple_sentence_2 + input_target = complex_sentence + case Task.DIFFERENCE: + input_a = complex_sentence + if flip: + input_b = simple_sentence_2 + input_target = simple_sentence_1 + else: + input_b = simple_sentence_1 + input_target = simple_sentence_2 + case _: + raise ValueError(f"Unknown task: {task}") + + tokenizer_encoder = self.tokenizer.create_encoder( + lang="eng_Latn", + ) + input_a_tokens = tokenizer_encoder(input_a) + input_b_tokens = tokenizer_encoder(input_b) + input_target_tokens = tokenizer_encoder(input_target) + + return { + # "input_a": input_a, + # "input_b": input_b, + # "input_target": input_target, + "input_a": input_a_tokens, + "input_b": input_b_tokens, + "input_target": input_target_tokens, + } + + try: + load_from_disk( + os.path.join( + self.data_dir, + "google-research-datasets", + "wiki_split", + "processed", + f"{self.task.value}_train", + ) + ) + load_from_disk( + os.path.join( + self.data_dir, + "google-research-datasets", + "wiki_split", + "processed", + f"{self.task.value}_val", + ) + ) + load_from_disk( + os.path.join( + self.data_dir, + "google-research-datasets", + "wiki_split", + "processed", + f"{self.task.value}_test", + ) + ) + return # TODO: uncomment this later + except FileNotFoundError: + print(f"Data for task {self.task.value} not found, downloading and processing...") + + dataset = 
load_dataset( + path="google-research-datasets/wiki_split", + ) + + train_dataset = dataset['train'].map( + preprocess, + remove_columns=dataset['train'].column_names, + # batched=True, + # batch_size=5, + ) + + val_dataset = dataset['validation'].map( + preprocess, + remove_columns=dataset['validation'].column_names, + # batched=True, + # batch_size=5, + ) + + test_dataset = dataset['test'].map( + preprocess, + # batched=True, + # batch_size=5, + remove_columns=dataset['test'].column_names, + ) + + train_dataset.save_to_disk( + os.path.join( + self.data_dir, + "google-research-datasets", + "wiki_split", + "processed", + f"{self.task.value}_train", + ) + ) + val_dataset.save_to_disk( + os.path.join( + self.data_dir, + "google-research-datasets", + "wiki_split", + "processed", + f"{self.task.value}_val", + ), + ) + test_dataset.save_to_disk( + os.path.join( + self.data_dir, + "google-research-datasets", + "wiki_split", + "processed", + f"{self.task.value}_test", + ), + ) + + def _setup_load_from_disk( + self, + stage: str, + path: str, + ): + if stage == "fit": + self.train_dataset = load_from_disk( + os.path.join( + self.data_dir, + path, + "processed", + f"{self.task.value}_train", + ) + ) + self.val_dataset = load_from_disk( + os.path.join( + self.data_dir, + path, + "processed", + f"{self.task.value}_val", + ) + ) + rank_zero_info(f"Loaded {path} {self.task.value} train dataset with {len(self.train_dataset)} samples.") + rank_zero_info(f"Loaded {path} {self.task.value} val dataset with {len(self.val_dataset)} samples.") + elif stage == "test": + self.test_dataset = load_from_disk( + os.path.join( + self.data_dir, + path, + "processed", + f"{self.task.value}_test", + ) + ) + rank_zero_info(f"Loaded {path} {self.task.value} test dataset with {len(self.test_dataset)} samples.") + else: + raise ValueError(f"Unknown stage: {stage}") + + def teardown(self, stage): + if stage == "fit": + self.train_dataset = None + self.val_dataset = None + rank_zero_info("Teardown: 
train and val datasets set to None.") + elif stage == "test": + self.test_dataset = None + rank_zero_info("Teardown: test dataset set to None.") + else: + raise ValueError(f"Unknown stage: {stage}") + +class TaskDataLoader(DataLoader): + def __init__(self, task: Task, *args, **kwargs): + super().__init__(*args, **kwargs) + self.task = task + + def __iter__(self): + for batch in super().__iter__(): + batch["task"] = self.task + yield batch + +class MultiTaskDataModule(LightningDataModule): + def __init__( + self, + data_dir: str, + tasks: list[Task], + batch_size: int, + ): + super().__init__() + self.datasets: dict[Task, TaskDataModule] = {} + for task in tasks: + self.datasets[task] = TaskDataModule( + data_dir=data_dir, + task=task, + batch_size=batch_size, + ) + + def prepare_data(self): + for dataset in self.datasets.values(): + dataset.prepare_data() + + def setup(self, stage): + for dataset in self.datasets.values(): + dataset.setup(stage=stage) + + def teardown(self, stage): + for dataset in self.datasets.values(): + dataset.teardown(stage=stage) + + def train_dataloader(self): + dataset = MultiTaskIterableDataset( + dataloader_dict={ + task: dataset.train_dataloader() + for task, dataset + in self.datasets.items() + } + ) + dataloader = DataLoader( + dataset=dataset, + collate_fn=lambda x: x[0], # Prevent extra tuple wrapping + ) + return dataloader + + def val_dataloader(self): + dataset = MultiTaskIterableDataset( + dataloader_dict={ + task: dataset.val_dataloader() + for task, dataset + in self.datasets.items() + } + ) + dataloader = DataLoader( + dataset=dataset, + collate_fn=lambda x: x[0], # Prevent extra tuple wrapping + ) + return dataloader + + def test_dataloader(self): + dataset = MultiTaskIterableDataset( + dataloader_dict={ + task: dataset.test_dataloader() + for task, dataset + in self.datasets.items() + } + ) + dataloader = DataLoader( + dataset=dataset, + collate_fn=lambda x: x[0], # Prevent extra tuple wrapping + ) + return dataloader + 
+class MultiTaskIterableDataset(IterableDataset): + def __init__( + self, + dataloader_dict: dict[str, DataLoader], + ): + super().__init__() + self.dataloader_dict = dataloader_dict + self.task_list = list(dataloader_dict.keys()) + + # self.dataloader_dict = dataloader_dict + # self.num_batches_dict = { + # task: len(dataloader) + # for task, dataloader in dataloader_dict.items() + # } + # self.task_list = list(dataloader_dict.keys()) + # self.world_size = 1 + # self.rank = 0 + # if dist.is_initialized(): + # self.world_size = dist.get_world_size() + # self.rank = dist.get_rank() + + # def __len__(self): + # total_batches = sum(self.num_batches_dict.values()) + # return total_batches + + def __len__(self): + total_batches = sum(len(dl) for dl in self.dataloader_dict.values()) + return total_batches + + def __iter__(self): + dataloader_dicts = { + task: iter(dataloader) + for task, dataloader + in self.dataloader_dict.items() + } + while True: + for task in self.task_list: + try: + yield next(dataloader_dicts[task]) + except StopIteration: + self.task_list.remove(task) + if not self.task_list: + return + + # all_batches = [] + # for dataloader in self.dataloader_dict.values(): + # all_batches.extend(iter(dataloader)) + + # random.shuffle(all_batches) + + # for i in range(0, len(all_batches)): + # yield all_batches[i] + +class Add(nn.Module): + def __init__(self, n_embd: int): + super().__init__() + self.linear = nn.Linear(2*n_embd, n_embd) + + def forward( + self, + x: Float[Tensor, "batch_size 2*n_embd"], + ): + x = self.linear(x) + return x + +class Difference(nn.Module): + def __init__(self, n_embd: int): + super().__init__() + self.linear = nn.Linear(2*n_embd, n_embd) + + def forward( + self, + x: Float[Tensor, "batch_size 2*n_embd"], + ): + x = self.linear(x) + return x + +@dataclass +class EncoderModelOutput(ModelOutput): + embeddings_sim: Float[Tensor, "batch_size n_embd"] + embeddings_pos: Float[Tensor, "batch_size n_embd"] | None = None + # target_seqs: 
list[str] | None = None + +class EncoderModel(nn.Module): + def __init__( + self, + ): + super().__init__() + self.encoder = TextToEmbeddingModelPipeline( + tokenizer="text_sonar_basic_encoder", + encoder="text_sonar_basic_encoder", + ) + + self.n_embd = 1024 + + # self.norm1 = nn.RMSNorm(self.n_embd) + self.add = Add(n_embd=self.n_embd) + self.difference = Difference(n_embd=self.n_embd) + # self.norm2 = nn.RMSNorm(self.n_embd) + + for param in self.encoder.model.parameters(): + param.requires_grad = False + + def forward( + self, + task: Task, + input_a: list[Float[Tensor, "seq_len"]], + input_b: list[Float[Tensor, "seq_len"]], + input_target: list[Float[Tensor, "seq_len"]] | None = None, + ): + # Version 1: input_a, input_b, input_target are list[str] + # input = input_a + input_b + # if input_target is not None: + # input += input_target + + # device = next(self.encoder.model.parameters()).device + # tokenizer_encoder = self.encoder.tokenizer.create_encoder( + # lang="eng_Latn", + # device=device, + # ) + + # seqs = [ + # tokenizer_encoder(seq) + # for seq in input + # ] + # collater = Collater( + # pad_value=self.encoder.tokenizer.vocab_info.pad_idx, + # ) + # batch = collater(seqs) + # tokens, padding_mask = get_seqs_and_padding_mask( + # data=batch, + # device=device, + # ) + + # Version 2: input_a, input_b, input_target are list[Float[Tensor, "seq_len"]] + seqs = input_a + input_b + if input_target is not None: + seqs += input_target + + collater = Collater( + pad_value=self.encoder.tokenizer.vocab_info.pad_idx, + ) + batch = collater(seqs) + device = next(self.encoder.model.parameters()).device + tokens, padding_mask = get_seqs_and_padding_mask( + data=batch, + device=device, + ) + + sequence_batch = SequenceBatch(tokens, padding_mask) + embeddings = self.encoder.model(sequence_batch).sentence_embeddings + + # embeddings = self.norm1(embeddings) + + batch_size = len(input_a) + embeddings_a = embeddings[0:batch_size] + embeddings_b = 
embeddings[batch_size:2*batch_size] + embeddings_target = embeddings[2*batch_size:] if input_target is not None else None + + embeddings_a_b = t.cat([embeddings_a, embeddings_b], dim=1) + + match task: + case Task.ADD: + embeddings_sim = self.add(embeddings_a_b) + case Task.DIFFERENCE: + embeddings_sim = self.difference(embeddings_a_b) + case _: + raise ValueError(f"Unknown task: {task}") + + # embeddings_sim = self.norm2(embeddings_sim) + embeddings_pos = embeddings_target + + return EncoderModelOutput( + embeddings_sim=embeddings_sim, + embeddings_pos=embeddings_pos, + # target_seqs=seqs[2*batch_size:] if input_target is not None else None, + ) + +@dataclass +class EncoderDecoderModelOutput(ModelOutput): + embeddings_sim: Float[Tensor, "batch_size n_embd"] + contrastive_loss: Float[Tensor, ""] | None = None + reconstruction_loss: Float[Tensor, ""] | None = None + loss: Float[Tensor, ""] | None = None + embeddings_pos: Float[Tensor, "batch_size n_embd"] | None = None + # decoded_outputs: list[str] | None = None + # decoded_targets: list[str] | None = None + +class EncoderDecoderModel(nn.Module): + def __init__( + self, + ): + super().__init__() + self.encoder = EncoderModel() + self.decoder = EmbeddingToTextModelPipeline( + tokenizer="text_sonar_basic_decoder", + decoder="text_sonar_basic_decoder", + ) + + for param in self.decoder.model.parameters(): + param.requires_grad = False + + def forward( + self, + task: Task, + input_a: list[Float[Tensor, "seq_len"]], + input_b: list[Float[Tensor, "seq_len"]], + input_target: list[Float[Tensor, "seq_len"]] | None = None, + # return_decoded: bool = False, + ): + encoder_output = self.encoder( + task=task, + input_a=input_a, + input_b=input_b, + input_target=input_target, + ) + + embeddings_sim = encoder_output.embeddings_sim # shape: (batch_size, 2, n_embd) + embeddings_pos = encoder_output.embeddings_pos + # target_seqs = encoder_output.target_seqs + + output_kwargs = { + "embeddings_sim": embeddings_sim, + } + + # if 
return_decoded: + # batch_size = embeddings_sim.shape[0] + # inputs = ( + # t.cat( + # [embeddings_sim, embeddings_pos], + # dim=0, + # ) + # if embeddings_pos is not None + # else embeddings_sim + # ) + # decoded = self.decoder.predict( + # inputs=inputs, + # target_lang="eng_Latn", + # max_seq_len=512, + # ) + # decoded_outputs = decoded[:batch_size] + # decoded_targets = ( + # decoded[batch_size:] + # if embeddings_pos is not None + # else None + # ) + # output_kwargs.update({ + # "decoded_outputs": decoded_outputs, + # "decoded_targets": decoded_targets, + # }) + + if input_target is not None: + cl_loss = self.compute_contrastive_loss( + embeddings_sim=embeddings_sim, + embeddings_pos=embeddings_pos, + ) + gen_loss = self.compute_reconstruction_loss( + input_target=input_target, + embeddings_sim=embeddings_sim, + ) + batch_size = embeddings_sim.shape[0] + gen_loss_weight = -np.log(1/batch_size) / -np.log(1/self.decoder.tokenizer.vocab_info.size) + # cl_loss random loss is -ln(1/batch_size) and + # gen_loss random loss is -ln(1/vocab_size) + loss = cl_loss + gen_loss * gen_loss_weight + output_kwargs.update({ + "contrastive_loss": cl_loss, + "reconstruction_loss": gen_loss, + "loss": loss, + "embeddings_pos": embeddings_pos, + }) + + return EncoderDecoderModelOutput(**output_kwargs) + + def compute_contrastive_loss( + self, + embeddings_sim: Float[Tensor, "batch_size n_embd"], + embeddings_pos: Float[Tensor, "batch_size n_embd"] + ): + # Contrastive learning loss + sim_fct = nn.CosineSimilarity(dim=-1) + temperature = 0.05 + cos_sim = sim_fct( + embeddings_sim.unsqueeze(dim=1), # shape: (batch_size, 1, n_embd) + embeddings_pos.unsqueeze(dim=0), # shape: (1, batch_size, n_embd) + ) / temperature + batch_size = embeddings_sim.shape[0] + cl_labels = t.arange(batch_size, device=cos_sim.device) + loss_fct = nn.CrossEntropyLoss() + cl_loss = loss_fct(cos_sim, cl_labels) + return cl_loss + + def compute_reconstruction_loss( + self, + input_target: list[Float[Tensor, 
"seq_len"]], + embeddings_sim: Float[Tensor, "batch_size n_embd"], + ): + # Generation loss + loss_fct = nn.CrossEntropyLoss() + + device = next(self.decoder.model.parameters()).device + input_to_decoder = [ + t.cat([ + t.tensor( + [ + self.decoder.tokenizer.vocab_info.eos_idx, + ], + device=device, + ), + seq[:-1], # Remove last token (EOS) + # seq[1:-1], # Remove first and last token (language token and EOS) + ]) + for seq in input_target + ] + collater = Collater( + pad_value=self.decoder.tokenizer.vocab_info.pad_idx, + ) + input_data = collater(input_to_decoder) + input_tensor, input_padding_mask = get_seqs_and_padding_mask( + data=input_data, + device=device, + ) + decoder_output, decoder_padding_mask = self.decoder.model.decode( + seqs=input_tensor, + padding_mask=input_padding_mask, + encoder_output=embeddings_sim.unsqueeze(1), + encoder_padding_mask=None, + ) + + model_output = self.decoder.model.project( + decoder_output=decoder_output, + decoder_padding_mask=decoder_padding_mask, + ) + + logits = model_output.logits + + labels = input_target + # labels = [ + # seq + # # seq[1:] # Remove first token (language token) + # for seq in input_target + # ] + + max_len_logits = logits.size(1) + max_len_labels = max(l.size(0) for l in labels) + max_len = max(max_len_logits, max_len_labels) + + logits_padded = logits + if logits.size(1) < max_len: + pad_amt = max_len - logits.size(1) + logits_padded = F.pad( + input=logits, + pad=(0, 0, 0, pad_amt), + value=self.decoder.tokenizer.vocab_info.pad_idx, + ) + + labels_padded = [] + for label in labels: + if label.size(0) < max_len: + pad_amt = max_len - label.size(0) + label = F.pad( + input=label, + pad=(0, pad_amt), + value=self.decoder.tokenizer.vocab_info.pad_idx, + ) + labels_padded.append(label) + labels_padded = t.stack(labels_padded, dim=0) + + gen_loss = loss_fct( + logits_padded.reshape(-1, logits_padded.size(-1)), # (batch_size * seq_len, vocab_size) + labels_padded.reshape(-1) # (batch_size * seq_len) + ) + 
return gen_loss + +class LitModel(LightningModule): + def __init__( + self, + lr: float, + num_warmup_steps: int, + num_training_steps: int, + ): + super().__init__() + self.save_hyperparameters() + + self.lr = lr + self.num_warmup_steps = num_warmup_steps + self.num_training_steps = num_training_steps + + self.model = EncoderDecoderModel() + + # self.dummy_parameter = nn.Parameter(t.tensor(0.0, requires_grad=True)) + + self.strict_loading = False + # This is a hack which is required + # because checkpoint['state_dict'] + # is modified in on_save_checkpoint + # to reduce checkpoint file size. + # In the future, instead of modifying + # checkpoint['state_dict'], we can + # pass the encoder and decoder + # in the argument, and use + # self.save_hyperparameters(ignore) + + def on_fit_start(self): + self.fix_device() + + def on_validation_start(self): + self.fix_device() + + def on_test_start(self): + self.fix_device() + + def fix_device(self): + device = next(self.model.decoder.model.parameters()).device + self.model.decoder.device = device + # This is a hack which is required + # because EmbeddngToTextModelPipeline + # sets the device in the constructor, + # but we need to set it after the model + # is moved to the correct device. 
+ + def get_trainable_state_dict(self): + state = {} + for name, param in self.named_parameters(): + if param.requires_grad: + state[name] = param.data.cpu() + return state + + def on_save_checkpoint(self, checkpoint): + checkpoint['state_dict'] = self.get_trainable_state_dict() + # def print_keys(d, indent=0): + # if isinstance(d, dict): + # for key, value in d.items(): + # rank_zero_info(f"{' ' * indent}Key: {key}") + # print_keys(value, indent=indent+1) + # print_keys(checkpoint) + + def training_step(self, batch): + outputs = self.model( + task=batch['task'], + input_a=batch['input_a'], + input_b=batch['input_b'], + input_target=batch['input_target'], + ) + + cl_loss = outputs.contrastive_loss + gen_loss = outputs.reconstruction_loss + loss = outputs.loss + + batch_size = len(batch['input_a']) + lr = self.trainer.optimizers[0].param_groups[0]['lr'] + self.log( + "lr", + lr, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + ) + self.log( + "train_cl_loss", + cl_loss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + sync_dist=True, + batch_size=batch_size, + ) + self.log( + "train_gen_loss", + gen_loss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + sync_dist=True, + batch_size=batch_size, + ) + self.log( + "train_loss", + loss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + sync_dist=True, + batch_size=batch_size, + ) + return loss + + # print( + # f"[GPU {self.global_rank}] " + # f"batch_idx: {batch_idx}, " + # f"task: {batch['task']}, " + # f"input_a: {batch['input_a']}, " + # f"input_a: {batch['input_a'][0][:30]}, " + # f"input_b: {batch['input_b'][0][:30]}, " + # f"input_target: {batch['input_target'][0][:30]}, " + # ) + # print( + # f"[GPU {self.global_rank}] " + # f"batch_idx: {batch_idx}, " + # f"task: {batch['task']}, " + # f"input_a: {batch['input_a'][1][:30]}, " + # f"input_b: {batch['input_b'][1][:30]}, " + # f"input_target: {batch['input_target'][1][:30]}, " + # ) + # 
sleep(0.1) + # return t.tensor(0.0, requires_grad=True) + + def vec2text( + self, + embeddings: Float[Tensor, "batch_size vocab_size"], + ): + texts = self.model.decoder.predict( + inputs=embeddings, + target_lang="eng_Latn", + max_seq_len=512, + ) + return texts + + def validation_step(self, batch, batch_idx): + # self.validation_step_outputs = { + # "contrastive_loss": [], + # "reconstruction_loss": [], + # "loss": [], + # # "decoded_targets": [], + # # "decoded_outputs": [], + # } + + outputs = self.model( + task=batch['task'], + input_a=batch['input_a'], + input_b=batch['input_b'], + input_target=batch['input_target'], + # return_decoded=True, + ) + embeddings_sim = outputs.embeddings_sim + embeddings_pos = outputs.embeddings_pos + cl_loss = outputs.contrastive_loss + gen_loss = outputs.reconstruction_loss + loss = outputs.loss + # decoded_outputs = outputs.decoded_outputs + # decoded_targets = outputs.decoded_targets + + # self.validation_step_outputs["contrastive_loss"].append(cl_loss) + # self.validation_step_outputs["reconstruction_loss"].append(gen_loss) + # self.validation_step_outputs["loss"].append(loss) + # self.validation_step_outputs["decoded_outputs"].extend(decoded_outputs) + # self.validation_step_outputs["decoded_targets"].extend(decoded_targets) + + batch_size = len(batch['input_a']) + self.log( + "val_cl_loss_epoch", + cl_loss, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True, + batch_size=batch_size, + ) + self.log( + "val_gen_loss_epoch", + gen_loss, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True, + batch_size=batch_size, + ) + self.log( + "val_loss_epoch", + loss, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True, + batch_size=batch_size, + ) + + if batch_idx == 0 and self.global_rank == 0: + self.log_examples( + tag="val_examples", + input_a=batch['input_a'], + input_b=batch['input_b'], + task=batch['task'], + embeddings_prediction=embeddings_sim, + embeddings_target=embeddings_pos, + ) + 
+ return loss + + # def on_validation_epoch_end(self): + # rank_zero_info("Validation epoch ended, calculating metrics...") + + # outputs = self.validation_step_outputs + # contrastive_loss = t.stack(outputs["contrastive_loss"]).mean() + # reconstruction_loss = t.stack(outputs["reconstruction_loss"]).mean() + # loss = t.stack(outputs["loss"]).mean() + # # decoded_outputs = outputs["decoded_outputs"] + # # decoded_targets = outputs["decoded_targets"] + + # # rouge = ROUGEScore() + # # rouge_score = rouge( + # # preds=decoded_outputs, + # # target=decoded_targets, + # # ) + # # rouge1_fmeasure = rouge_score["rouge1_fmeasure"].to(device=self.device) + # # rouge2_fmeasure = rouge_score["rouge2_fmeasure"].to(device=self.device) + # # rougeL_fmeasure = rouge_score["rougeL_fmeasure"].to(device=self.device) + + # self.log( + # "val_cl_loss_epoch", + # contrastive_loss, + # logger=True, + # on_epoch=True, + # sync_dist=True, + # ) + # self.log( + # "val_gen_loss_epoch", + # reconstruction_loss, + # logger=True, + # on_epoch=True, + # sync_dist=True, + # ) + # self.log( + # "val_loss_epoch", + # loss, + # logger=True, + # on_epoch=True, + # sync_dist=True, + # ) + + # # self.log( + # # "val_rouge1", + # # rouge1_fmeasure, + # # logger=True, + # # sync_dist=True, + # # ) + # # self.log( + # # "val_rouge2", + # # rouge2_fmeasure, + # # logger=True, + # # sync_dist=True, + # # ) + # # self.log( + # # "val_rougeL", + # # rougeL_fmeasure, + # # logger=True, + # # sync_dist=True, + # # ) + + # self.validation_step_outputs.clear() + + def test_step(self, batch, batch_idx): + # self.test_step_outputs = { + # "contrastive_loss": [], + # "reconstruction_loss": [], + # "loss": [], + # # "decoded_targets": [], + # # "decoded_outputs": [], + # } + + outputs = self.model( + task=batch['task'], + input_a=batch['input_a'], + input_b=batch['input_b'], + input_target=batch['input_target'], + # return_decoded=True, + ) + embeddings_sim = outputs.embeddings_sim + embeddings_pos = 
outputs.embeddings_pos + cl_loss = outputs.contrastive_loss + gen_loss = outputs.reconstruction_loss + loss = outputs.loss + # decoded_outputs = outputs.decoded_outputs + # decoded_targets = outputs.decoded_targets + + # self.test_step_outputs["contrastive_loss"].append(cl_loss) + # self.test_step_outputs["reconstruction_loss"].append(gen_loss) + # self.test_step_outputs["loss"].append(loss) + # self.test_step_outputs["decoded_outputs"].extend(decoded_outputs) + # self.test_step_outputs["decoded_targets"].extend(decoded_targets) + + batch_size = len(batch['input_a']) + self.log( + "test_cl_loss_epoch", + cl_loss, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True, + batch_size=batch_size, + ) + self.log( + "test_gen_loss_epoch", + gen_loss, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True, + batch_size=batch_size, + ) + self.log( + "test_loss_epoch", + loss, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True, + batch_size=batch_size, + ) + + if batch_idx == 0 and self.global_rank == 0: + self.log_examples( + tag="test_examples", + input_a=batch['input_a'], + input_b=batch['input_b'], + task=batch['task'], + embeddings_prediction=embeddings_sim, + embeddings_target=embeddings_pos, + ) + + return loss + + def log_examples( + self, + tag: str, + input_a: list[Float[Tensor, "seq_len"]], + input_b: list[Float[Tensor, "seq_len"]], + task: Task, + embeddings_prediction: Float[Tensor, "batch_size n_embd"], + embeddings_target: Float[Tensor, "batch_size n_embd"] | None = None, + ): + """ + Log some examples to TensorBoard + """ + text_decoder = self.model.decoder.tokenizer.create_decoder() + input_a = [ + text_decoder(seq) + for seq in input_a + ] + input_b = [ + text_decoder(seq) + for seq in input_b + ] + + embeddings = t.cat( + [embeddings_prediction, embeddings_target], + dim=0, + ) + texts = self.vec2text( + embeddings=embeddings, + ) + batch_size = len(input_a) + prediction = texts[:batch_size] + target = 
texts[batch_size:] + + tasks = [task.value] * batch_size + markdown_text = self.create_markdown_table( + input_a=input_a, + input_b=input_b, + tasks=tasks, + prediction=prediction, + target=target, + ) + + tensorboard = self.logger.experiment + tensorboard.add_text( + tag=tag, + text_string=markdown_text, + global_step=self.global_step, + ) + + def create_markdown_table( + self, + input_a: list[str], + input_b: list[str], + tasks: list[str], + prediction: list[str], + target: list[str], + ): + batch_size = len(input_a) + rows = [ + "| input_a | input_b | task | prediction | target |", + "| ------- | ------- | ---- | ---------- | ------ |", + ] + for i in range(batch_size): + rows.append( + f"| {input_a[i]} |" + f" {input_b[i]} |" + f" {tasks[i]} |" + f" {prediction[i]} |" + f" {target[i]} |" + ) + markdown_text = "\n".join(rows) + return markdown_text + + + # def on_test_epoch_end(self): + # rank_zero_info("Test epoch ended, calculating metrics...") + + # outputs = self.test_step_outputs + # contrastive_loss = t.stack(outputs["contrastive_loss"]).mean() + # reconstruction_loss = t.stack(outputs["reconstruction_loss"]).mean() + # loss = t.stack(outputs["loss"]).mean() + # # decoded_outputs = outputs["decoded_outputs"] + # # decoded_targets = outputs["decoded_targets"] + + # # rouge = ROUGEScore() + # # rouge_score = rouge( + # # preds=decoded_outputs, + # # target=decoded_targets, + # # ) + # # rouge1_fmeasure = rouge_score["rouge1_fmeasure"].to(device=self.device) + # # rouge2_fmeasure = rouge_score["rouge2_fmeasure"].to(device=self.device) + # # rougeL_fmeasure = rouge_score["rougeL_fmeasure"].to(device=self.device) + + # self.log( + # "test_cl_loss_epoch", + # contrastive_loss, + # logger=True, + # sync_dist=True, + # ) + # self.log( + # "test_gen_loss_epoch", + # reconstruction_loss, + # logger=True, + # sync_dist=True, + # ) + # self.log( + # "test_loss_epoch", + # loss, + # logger=True, + # sync_dist=True, + # ) + + # # self.log( + # # "test_rouge1", + # # 
rouge1_fmeasure, + # # logger=True, + # # sync_dist=True, + # # ) + # # self.log( + # # "test_rouge2", + # # rouge2_fmeasure, + # # logger=True, + # # sync_dist=True, + # # ) + # # self.log( + # # "test_rougeL", + # # rougeL_fmeasure, + # # logger=True, + # # sync_dist=True, + # # ) + + # self.test_step_outputs.clear() + + def configure_optimizers(self): + param_dict = { + pn: p + for pn, p + in self.named_parameters() + if p.requires_grad + } + decay_params = [ + p for _, p in param_dict.items() + if p.dim() >= 2 + ] + nodecay_params = [ + p for _, p in param_dict.items() + if p.dim() < 2 + ] + params = [ + {'params': decay_params, 'weight_decay': 0.01}, + {'params': nodecay_params, 'weight_decay': 0.0}, + ] + + optimizer = t.optim.AdamW( + params=params, + lr=self.lr, + ) + scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=self.num_warmup_steps, + num_training_steps=self.num_training_steps, + ) + return { + "optimizer": optimizer, + "lr_scheduler": { + "scheduler": scheduler, + "interval": "step", + "frequency": 1, + } + } + +# datamodule = TaskDataModule( +# data_dir=args['data_dir'], +# task=Task.ADD, +# batch_size=args['batch_size'], +# ) + +datamodule = MultiTaskDataModule( + data_dir=args['data_dir'], + tasks=[Task.ADD, Task.DIFFERENCE], + batch_size=args['batch_size'], +) + +# %% + +checkpoint_callback = ModelCheckpoint( + # dirpath=f"{args['default_root_dir']}/checkpoints", + filename="{epoch}-{step}-{val_loss_epoch:.4f}", + monitor="val_loss_epoch", + mode="min", + save_top_k=1, +) + +if args["strategy"] == "ddp": + strategy = DDPStrategy(find_unused_parameters=True) +elif args["strategy"] is not None: + strategy = args["strategy"] +else: + strategy = "auto" + +trainer = Trainer( + accelerator=args['accelerator'], + strategy=strategy, + devices=args['devices'], + precision="bf16-mixed", + callbacks=[checkpoint_callback], + max_steps=args['max_steps'], + # val_check_interval=args['max_steps'], + val_check_interval=250, + 
enable_checkpointing=True, + # profiler="simple", + default_root_dir=args['default_root_dir'], + # log_every_n_steps=1, +) + +model = LitModel( + lr=args['lr'], + num_warmup_steps=args['num_warmup_steps'], + num_training_steps=args['max_steps'], +) + +if args['train']: + trainer.fit( + model=model, + datamodule=datamodule, + ckpt_path=args['checkpoint_path'], + ) + +trainer.test( + model=model, + datamodule=datamodule, + ckpt_path=args['checkpoint_path'], +) + +# %% diff --git a/experiments/data_preprocess/main.sh b/experiments/data_preprocess/main.sh new file mode 100644 index 0000000..6b7a1d3 --- /dev/null +++ b/experiments/data_preprocess/main.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +CMD=" +python main.py \ + --data_dir $DATA_DIR \ + --default_root_dir $DEFAULT_ROOT_DIR \ + --accelerator gpu \ + --strategy ddp \ + --devices 2 3 \ + --batch_size 8 \ + --lr 1e-4 \ + --num_warmup_steps 1237 \ + --max_steps 123742 \ +" + +if [ -n "$TRAIN" ]; then + CMD="$CMD --train" +fi + +if [ -n "$CHECKPOINT_PATH" ]; then + CMD="$CMD --checkpoint_path $CHECKPOINT_PATH" +fi + +eval $CMD \ No newline at end of file diff --git a/notes/gen_loss.py b/notes/gen_loss.py new file mode 100644 index 0000000..c5beb82 --- /dev/null +++ b/notes/gen_loss.py @@ -0,0 +1,481 @@ +# %% [markdown] +""" +Goals: +- How to compute contrastive learning loss for text embeddings +- How to compute generation loss for text generation models + +Notes: +- The contrastive learning loss is penalizing the model for generating embeddings + that is not similar to the target embeddings and similar to other embeddings. + + However, I am not sure whether this is the right way to penalize the model. + I am not sure whether the model should be penalized for generating embeddings + that are similar to other embeddings + + Also, a cosine similarity range values are between -1 and 1, so the loss + will never be 0, even if the positive embeddings are identical and the + negative embeddings are very different. 
The solution is to scale the + cosine similarity by a temperature value, to exaggerate the differences + between the correct and incorrect embeddings. + + random guess loss = -ln(1/batch_size) + +- The generation loss might be miscalculated. + + The way `fairseq2.generation.BeamSearchSeq2SeqGenerator` works is not straightforward. + It computes the logits per chunk of 1, and then compute the scores per chunk. + Then, do beam search, select the token, and repeats until `max_seq_len` is reached. + Lastly, it sort the hypotheses by their scores. In other words, I am not sure + whether we can decode it in one forward pass and compute the loss in one forward pass. + + So far, reconstructing "hello world" will result in loss of 4.53, even though + the model predicts the correct token. The solution is to scale the loss + by 0.01, so that it does not dominate the contrastive learning loss. + + random guess loss = -ln(1/vocab_size) +""" + +# %% + +from sonar.inference_pipelines.text import ( + TextToEmbeddingModelPipeline, + EmbeddingToTextModelPipeline, +) + +from fairseq2.nn.padding import get_seqs_and_padding_mask +from fairseq2.data import Collater +from fairseq2.models.sequence import SequenceBatch +from fairseq2.generation import BeamSearchSeq2SeqGenerator + +import torch as t +from torch import nn +import torch.nn.functional as F + +from torchmetrics.text import ROUGEScore + +# %% TODO: remove this later + +text2vec_model = TextToEmbeddingModelPipeline( + tokenizer="text_sonar_basic_encoder", + encoder="text_sonar_basic_encoder", +) + +vec2text_model = EmbeddingToTextModelPipeline( + tokenizer="text_sonar_basic_decoder", + decoder="text_sonar_basic_decoder", +) + +# %% + +print("Reproducing text2vec_model.predict() behavior...") +max_seq_len = 512 +tokenizer_encoder = text2vec_model.tokenizer.create_encoder(lang="eng_Latn") +sentences = [ + "hello world", + "hello world", + "hello world", + # "hello", + # "hello world. 
my name is jeff", +] +seqs = [ + tokenizer_encoder(sentence) + for sentence in sentences +] +seqs = [ + seq[:max_seq_len] + if max_seq_len is not None + else seq + for seq in seqs +] +collater = Collater(pad_value=text2vec_model.tokenizer.vocab_info.pad_idx) +batch = collater(seqs) +tokens, padding_mask = get_seqs_and_padding_mask(data=batch) +sequence_batch = SequenceBatch(tokens, padding_mask) +embeddings = text2vec_model.model(sequence_batch).sentence_embeddings + +print(f"embeddings:\n{embeddings}") + +# %% + +print("Reproducing vec2text_model.predict() behavior...") + +target_text_encoder = vec2text_model.tokenizer.create_encoder( + task="translation", + lang="eng_Latn", + mode="target", +) +target_prefix_seqs = target_text_encoder.prefix_indices +text_decoder = vec2text_model.tokenizer.create_decoder() + +batch_size = len(embeddings) + +generator = BeamSearchSeq2SeqGenerator( + model=vec2text_model.model, + max_seq_len=max_seq_len, +) + +generator_output = generator( + embeddings, + None, + target_prefix_seqs.expand(batch_size, -1), + None +) + +texts: list[str] = [] +for idx, hypotheses in enumerate(generator_output.hypotheses): + texts.append(text_decoder(hypotheses[0].seq)) +print("texts:", texts) + +# %% + +print("Get logits") + +# One by one +input_to_decoder = [] +logits = [] +for idx, hypotheses in enumerate(generator_output.hypotheses): + # seq = t.cat( + # [ + # t.tensor([text2vec_model.tokenizer.vocab_info.eos_idx]), + # hypotheses[0].seq[:-1], + # ], + # ) + seq = t.cat( + [ + t.tensor( + [ + text2vec_model.tokenizer.vocab_info.eos_idx, + ] + ), + seqs[idx][:-1], + # seqs[idx][1:-1], # Remove first and last token (language token and EOS) + ], + ) + input_to_decoder.append(seq) + decoder_output, decoder_padding_mask = vec2text_model.model.decode( + seqs=seq.unsqueeze(0), + padding_mask=None, + encoder_output=embeddings[idx].unsqueeze(0).unsqueeze(0), + encoder_padding_mask=None, + ) + model_output = vec2text_model.model.project( + decoder_output, 
decoder_padding_mask + ) + logits.append(model_output.logits.squeeze(0)) + +labels = [ + seq + # seq[1:], # Remove first token (language token) + for seq in seqs + # hypotheses[0].seq + # for hypotheses in generator_output.hypotheses +] + +max_len_logits = max(l.size(0) for l in logits) +max_len_labels = max(l.size(0) for l in labels) +max_len = max(max_len_logits, max_len_labels) + +logits_padded = [] +for logit in logits: + if logit.size(0) < max_len: + pad_amt = max_len - logit.size(0) + logit = F.pad(logit, (0, 0, 0, pad_amt), value=vec2text_model.tokenizer.vocab_info.pad_idx) + logits_padded.append(logit) +logits_padded = t.stack(logits_padded, dim=0) + +labels_padded = [] +for label in labels: + if label.size(0) < max_len: + pad_amt = max_len - label.size(0) + label = F.pad(label, (0, pad_amt), value=vec2text_model.tokenizer.vocab_info.pad_idx) + labels_padded.append(label) +labels_padded = t.stack(labels_padded, dim=0) + +loss_fct = nn.CrossEntropyLoss( + ignore_index=vec2text_model.tokenizer.vocab_info.pad_idx, +) +gen_loss = loss_fct( + logits_padded.reshape(-1, logits_padded.size(-1)), # (batch_size * seq_len, vocab_size) + labels_padded.reshape(-1) # (batch_size * seq_len) +) +print("input_to_decoder:") +for seq in input_to_decoder: + print(seq) +print(f"labels_padded:\n{labels_padded}") +print("vocab_size:", logits_padded.size(-1)) +print( + "random guess loss:", + loss_fct( + t.zeros_like(logits_padded).reshape(-1, logits_padded.size(-1)), + labels_padded.reshape(-1) + ).item() +) +print("gen_loss:", gen_loss.item()) + +greedy_token = logits_padded.argmax(dim=-1) +print("greedy_token:", greedy_token) +for seq in greedy_token: + print(text_decoder(seq)) + +# %% + +# Batch +input_to_decoder = [ + t.cat([ + t.tensor( + [ + text2vec_model.tokenizer.vocab_info.eos_idx, + ] + ), + seq[:-1], # Remove last token (EOS) + # seq[1:-1], # Remove first and last token (language token and EOS) + ]) + for seq in seqs +] +input_data = collater(input_to_decoder) 
# Batched generation-loss computation: decode every teacher-forcing input in a
# single forward pass, project to vocabulary logits, and score them against the
# original token sequences with pad-masked cross-entropy.
input_tensor, input_padding_mask = get_seqs_and_padding_mask(data=input_data)

decoder_output, decoder_padding_mask = vec2text_model.model.decode(
    seqs=input_tensor,
    padding_mask=input_padding_mask,
    # Each sentence embedding is presented as a length-1 "encoder sequence".
    encoder_output=embeddings.unsqueeze(1),
    encoder_padding_mask=None,
)

model_output = vec2text_model.model.project(
    decoder_output=decoder_output,
    decoder_padding_mask=decoder_padding_mask,
)

logits = model_output.logits

# Targets are simply the original (un-shifted) token sequences.
labels = seqs

pad_token = vec2text_model.tokenizer.vocab_info.pad_idx

# Pad logits and labels out to one common length so they can be flattened
# together; padded label positions are ignored by the loss below.
max_len = max(logits.size(1), max(label.size(0) for label in labels))

if logits.size(1) < max_len:
    logits_padded = F.pad(
        logits,
        (0, 0, 0, max_len - logits.size(1)),
        value=pad_token,
    )
else:
    logits_padded = logits

labels_padded = t.stack(
    [
        F.pad(label, (0, max_len - label.size(0)), value=pad_token)
        if label.size(0) < max_len
        else label
        for label in labels
    ],
    dim=0,
)

loss_fct = nn.CrossEntropyLoss(
    ignore_index=pad_token,
)
gen_loss = loss_fct(
    logits_padded.reshape(-1, logits_padded.size(-1)),  # (batch_size * seq_len, vocab_size)
    labels_padded.reshape(-1),  # (batch_size * seq_len)
)

print("input_to_decoder:")
for seq in input_to_decoder:
    print(seq)
print(f"labels_padded:\n{labels_padded}")
print("vocab_size:", logits_padded.size(-1))
print(
    "random guess loss:",
    loss_fct(
        t.zeros_like(logits_padded).reshape(-1, logits_padded.size(-1)),
        labels_padded.reshape(-1)
    ).item()
)
print("gen_loss:", gen_loss.item())

# Greedy sanity check: pick the argmax token at every position and decode it.
greedy_token = logits_padded.argmax(dim=-1)
print("greedy_token:", greedy_token)
for seq in greedy_token:
    print(text_decoder(seq))

# %%
+print("Decoding with greedy search...") +batch_size = embeddings.shape[0] +input_seqs = t.tensor( + [ + [ + text2vec_model.tokenizer.vocab_info.eos_idx, + 256047, # language token + ] + for _ in range(batch_size) + ], +) + +with t.no_grad(): + for i in range(max_seq_len): + decoder_output, decoder_padding_mask = vec2text_model.model.decode( + seqs=input_seqs, + padding_mask=None, + encoder_output=embeddings.unsqueeze(1), + encoder_padding_mask=None, + ) + greedy_token = ( + vec2text_model.model.project( + decoder_output=decoder_output, + decoder_padding_mask=decoder_padding_mask, + ) + .logits[:, -1, :] + .argmax(dim=-1, keepdim=True) + ) + input_seqs = t.cat( + [input_seqs, greedy_token], + dim=1 + ) + if ( + greedy_token[-1] == text2vec_model.tokenizer.vocab_info.eos_idx + ): + break + +print(f"seqs:\n{input_seqs}") + +for seq in input_seqs: + print(text_decoder(seq)) + +# %% + +input_a = [ + "hello world", + "lorem ipsum", +] +input_b = [ + "world", + "ipsum", +] +input_target = [ + "hello", + "ipsum", +] +# input_target = None + +batch_size = len(input_a) + +input = input_a + input_b + input_target + +embeddings = text2vec_model.predict( + input=input, + source_lang="eng_Latn", +) +print("embeddings shape:", embeddings.shape) + +embeddings = embeddings.clone() +# norm1 = nn.RMSNorm(1024) +# embeddings = norm1(embeddings) + +embeddings_a = embeddings[0:batch_size] +embeddings_b = embeddings[batch_size:2*batch_size] +embeddings_target = embeddings[2*batch_size:] + +print("embeddings_a shape:", embeddings_a.shape) +print("embeddings_b shape:", embeddings_b.shape) + +embeddings_a_b = t.cat([embeddings_a, embeddings_b], dim=1) +print("embeddings_a_b shape:", embeddings_a_b.shape) + +linear = nn.Linear(2*1024, 1024) +embeddings_sim = linear(embeddings_a_b) # shape: (batch_size, 1024) +# norm2 = nn.RMSNorm(1024) +# embeddings_sim = norm2(embeddings_sim) +print("embeddings_sim shape:", embeddings_sim.shape) + +embeddings_pos = embeddings_target + +texts = 
vec2text_model.predict( + inputs=embeddings_sim, + target_lang="eng_Latn", + max_seq_len=10, +) +print("texts:", texts) + +if input_target is not None: + sim_fct = nn.CosineSimilarity(dim=-1) + temperature = 0.05 + cos_sim = sim_fct( + embeddings_sim.unsqueeze(dim=1), # shape: (batch_size, 1, n_embd) + embeddings_pos.unsqueeze(dim=0), # shape: (1, batch_size, n_embd) + ) / temperature + # cos_sim = t.tensor([ + # [1.0, -1.0], + # [-1.0, 1.0], + # ]) + print(f"cos_sim:\n{cos_sim}") + cl_labels = t.arange(batch_size, device=embeddings_sim.device) + loss_fct = nn.CrossEntropyLoss() + cl_loss = loss_fct(cos_sim, cl_labels) + print("cl_loss:", cl_loss.item()) + +# %% + +input = [ + "hello world", + "world is a beautiful place", + "my name is jeff", + "name is a way to identify a person", + "is that true", + "jeff is the CEO of the company", + "and he is a good person", + "you can trust him", +] + +embeddings = text2vec_model.predict( + input=input, + source_lang="eng_Latn", +) + +sim_fct = nn.CosineSimilarity(dim=-1) +temperature = 0.05 +cos_sim = sim_fct( + embeddings.unsqueeze(dim=1), # shape: (batch_size, 1, n_embd) + embeddings.unsqueeze(dim=0), # shape: (1, batch_size, n_embd) +) / temperature +print(f"cos_sim:\n{cos_sim}") +loss_fct = nn.CrossEntropyLoss() +cl_labels = t.arange(len(input), device=embeddings.device) +cl_loss = loss_fct(cos_sim, cl_labels) +print("cl_loss:", cl_loss.item()) + +# %% + +preds = [ + "hello world", + "hello", +] +target = [ + "hello world", + "hello ?", +] + +print(f"preds: {preds}") +print(f"target: {target}") + +rouge = ROUGEScore(accumulate='avg') +rouge_score = rouge(preds, target) +rogue1_fmeasure = rouge_score["rouge1_fmeasure"] +rogue2_fmeasure = rouge_score["rouge2_fmeasure"] +rougeL_fmeasure = rouge_score["rougeL_fmeasure"] +rougeLsum_fmeasure = rouge_score["rougeLsum_fmeasure"] + +print(f"rouge1_fmeasure: {rogue1_fmeasure:.4f}") +# unigrams, rogue1_fmeasure=1.0 because the target unigrams are 3, +# and it matched 3 times in 
preds +print(f"rouge2_fmeasure: {rogue2_fmeasure:.4f}") +# bigrams +print(f"rougeL_fmeasure: {rougeL_fmeasure:.4f}") +# longest common subsequence +print(f"rougeLsum_fmeasure: {rougeLsum_fmeasure:.4f}") +# LCS over concatenated text + +# %% diff --git a/notes/naive_difference_operator.py b/notes/naive_difference_operator.py new file mode 100644 index 0000000..e4f3dfb --- /dev/null +++ b/notes/naive_difference_operator.py @@ -0,0 +1,428 @@ +# %% [markdown] +""" +1. `'Capital of' Vector = Embedding Relation - Embedding Control` does not work +1. `Inversion Vector = Embedding Passive - Embedding Active` does not work +2. `Relational Vector = Embedding Sentence - Embedding Cause - Embedding Effect` does not work +3. `Politeness Vector = Embedding Polite - Embedding Impolite` works for seen sentences, but not for unseen sentences +""" + +# %% + +import os +import sys + +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +if project_root not in sys.path: + print(f"Adding project root to sys.path: {project_root}") + sys.path.append(project_root) + +# %% + +from argparse import ArgumentParser +from typing import TypedDict + +import torch as t +from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline +from sonar.inference_pipelines.text import EmbeddingToTextModelPipeline + +from sklearn.decomposition import PCA +import plotly.graph_objects as go + +# %% + +class Args(TypedDict): + workspace_path: str + device: str + +def parse_args(): + parser = ArgumentParser(description="Run interpretable relational experiments.") + + parser.add_argument( + "--workspace_path", + type=str, + help="Path to the workspace directory where the experiments will be run.", + ) + + parser.add_argument( + "--device", + type=str, + help="Device to run the model on (e.g., 'cuda:0' for GPU, 'cpu' for CPU).", + ) + + ns = parser.parse_args() + return Args(**vars(ns)) + +if False: + WORKSPACE_PATH = "/workspace/ALGOVERSE/UJR/jason" + sys.argv = [ + 
'main.py', + '--workspace_path', WORKSPACE_PATH, + '--device', 'cuda:1', + ] + +args = parse_args() + +print("Parsed arguments:") +for key, value in args.items(): + print(f"{key}: {value}") + +# %% + +print("Loading text-to-embedding and embedding-to-text models...") +text2vec_model = TextToEmbeddingModelPipeline( + encoder="text_sonar_basic_encoder", + tokenizer="text_sonar_basic_encoder", + device=t.device(args["device"]), +) + +vec2text_model = EmbeddingToTextModelPipeline( + decoder="text_sonar_basic_decoder", + tokenizer="text_sonar_basic_decoder", + device=t.device(args["device"]), +) + +# %% [markdown] +""" +$$\vec{v}_{\text{capital\_of}} = S_{\text{relation}} - S_{\text{control}}$$ + +Conclusion: It doesn't work +""" + +sentences = [ + ("Paris is the capital of France.", # S_relation + "Paris and France are both in Europe.", # S_control + "Berlin is a large city in Germany."), # S_target + ("Berlin is the capital of Germany.", + "Berlin and Germany are both in Europe.", + "Paris is a large city in France."), + ("Madrid is the capital of Spain.", + "Madrid and Spain are both in Europe.", + "Lisbon is a large city in Portugal."), + ("Rome is the capital of Italy.", + "Rome and Italy are both in Europe.", + "Athens is a large city in Greece."), + ("London is the capital of the United Kingdom.", + "London and the United Kingdom are both in Europe.", + "Dublin is a large city in Ireland."), +] + +embeddings = text2vec_model.predict( + [sentence for tup in sentences for sentence in tup], + source_lang="eng_Latn", +) + +v_capital_of = t.stack([ + embeddings[i * 3 + 0] - embeddings[i * 3 + 1] + for i in range(len(sentences)) +]).mean(dim=0) + +intervened_embeddings = t.stack([ + embeddings[i * 3 + 2] + v_capital_of + for i in range(len(sentences)) +]) + +decoded_v_capital_of = vec2text_model.predict( + [v_capital_of], + target_lang="eng_Latn", +) + +decoded = vec2text_model.predict( + intervened_embeddings, + target_lang="eng_Latn", +) + 
# Report the decoded "capital-of" vector and each intervened target sentence
# (set up in the preceding cell) to eyeball whether the arithmetic worked.
print(f"S_decoded_v_capital_of: {decoded_v_capital_of[0]}")

for i, tup in enumerate(sentences):
    print(f"S_relation_{i}: {tup[0]}")
    print(f"S_control_{i}: {tup[1]}")
    print(f"S_target_{i}: {tup[2]}")
    print(f"S_decoded_{i}: {decoded[i]}")
    print()

# %% [markdown]
"""
$$\vec{v}_\text{inversion} = S_{\text{passive}} - S_{\text{active}}$$

Conclusion: It does not work
"""

# (active, passive) sentence pairs used to estimate the inversion direction.
sentences = [
    ("The heavy rain caused flooding",
     "The flooding was caused by the heavy rain"),
    ("The dog chased the cat.",
     "The cat was chased by the dog."),
    ("The teacher praised the student.",
     "The student was praised by the teacher."),
    ("The chef cooked a delicious meal.",
     "A delicious meal was cooked by the chef."),
    ("The artist painted a beautiful picture.",
     "A beautiful picture was painted by the artist."),
    ("The scientist discovered a new species.",
     "A new species was discovered by the scientist."),
]

# Embed all sentences in one batch; pairs are laid out flat as
# [active_0, passive_0, active_1, passive_1, ...].
embeddings = text2vec_model.predict(
    [sentence for tup in sentences for sentence in tup],
    source_lang="eng_Latn",
)

# Mean difference (passive - active) over all pairs.
v_inversion = t.stack([
    embeddings[i * 2 + 1] - embeddings[i * 2 + 0]
    for i in range(len(sentences))
]).mean(dim=0)

# Apply the inversion vector to every active-sentence embedding.
intervened_embeddings = t.stack([
    embeddings[i * 2 + 0] + v_inversion
    for i in range(len(sentences))
])

decoded_v_inversion = vec2text_model.predict(
    [v_inversion],
    target_lang="eng_Latn",
)

decoded = vec2text_model.predict(
    intervened_embeddings,
    target_lang="eng_Latn",
)

print(f"S_decoded_v_inversion: {decoded_v_inversion[0]}")
for i, tup in enumerate(sentences):
    print(f"S_active_{i}: {tup[0]}")
    print(f"S_passive_{i}: {tup[1]}")
    print(f"S_decoded_{i}: {decoded[i]}")
    print()

# %% [markdown]
"""
$$\vec{v}_{\text{relational}_i} = z_{\text{sentence}_i} - z_{\text{cause}_i} - z_{\text{effect}_i}$$

$$\text{alignment\_score} = \text{cosine\_similarity}(\vec{v}_\text{claimed\_relation},\vec{v}_\text{relational})$$

Conclusion: It does not work
"""

# Each tuple is (full sentence, cause with non-cause words masked as "_",
# effect with non-effect words masked as "_").
sentences = [
    ("Heavy rain caused flooding",  # S_sentence
     "Heavy rain _ _",  # S_cause
     "_ _ _ flooding"),  # S_effect
    ("Heavy rain resulted in flooding",
     "Heavy rain _ _ _",
     "_ _ _ flooding"),
    ("Heavy rain led to flooding",
     "Heavy rain _ _ _",
     "_ _ _ _ flooding"),
    ("Smoking increases the risk of cancer",
     "Smoking _ _ _ _ _",
     "_ the risk of cancer"),
    ("Smoking is a major cause of cancer",
     "Smoking _ _ _ _ _ _",
     "_ _ _ _ _ _ cancer"),
    ("Smoking contributes to the development of cancer",
     "Smoking _ _ _ _ _ _",
     "_ _ _ the development of cancer"),
]

# sentences = [
#     tuple(s.replace("_", "").strip() for s in tup)
#     for tup in sentences
# ]

# Claims to score; the 4th field is the human label used only for plotting.
claimed_sentences = [
    ("Vaccines caused a significant reduction in disease incidence",  # S_sentence
     "Vaccines _ _ _ _ _ _ _",  # S_cause
     "_ _ a significant reduction in disease incidence",  # S_effect
     "OK"),
    ("Vaccines cause autism",
     "Vaccines _ _",
     "_ _ autism",
     "Misleading claim"),
    ("Vaccines lead to autism",
     "Vaccines _ _ _",
     "_ _ _ autism",
     "Misleading claim"),
    ("Vaccines save lives",
     "Vaccines _ _",
     "_ _ lives",
     "OK"),
    ("Vaccines prevent disease",
     "Vaccines _ _",
     "_ _ disease",
     "OK"),
    ("Vaccines protect against infections",
     "Vaccines _ _ _",
     "_ _ against infections",
     "OK"),
    ("Vacciness cause autism. This is a fact",
     "Vaccines _ _. _ _ _ _",
     "_ _ autism. _ _ _ _ _",
     "Misleading claim"),
    ("Vaccines are safe and effective",
     "Vaccines _ _ _ _",
     "_ _ _ _ effective",
     "OK"),
    ("Global warming is caused by solar activity",
     "Global warming _ _ _ _ _",
     "_ _ _ _ _ solar activity",
     "Misleading claim"),
]

# claimed_sentences = [
#     tuple(s.replace("_", "").strip() for s in tup)
#     for tup in claimed_sentences
# ]

# Embed reference triples; laid out flat as
# [sentence_0, cause_0, effect_0, sentence_1, ...].
embeddings = text2vec_model.predict(
    [sentence for tup in sentences for sentence in tup],
    source_lang="eng_Latn",
)

claimed_embeddings = text2vec_model.predict(
    [sentence for tup in claimed_sentences for sentence in tup],
    source_lang="eng_Latn",
)

# Relational residue: sentence embedding minus cause and effect embeddings.
v_relational = t.stack([
    embeddings[i * 3 + 0] - embeddings[i * 3 + 1] - embeddings[i * 3 + 2]
    for i in range(len(sentences))
])

v_claimed_relational = t.stack([
    claimed_embeddings[i * 3 + 0] - claimed_embeddings[i * 3 + 1] - claimed_embeddings[i * 3 + 2]
    for i in range(len(claimed_sentences))
])

# Cosine similarity of each claim's residue against the mean reference residue.
alignment_score = t.cosine_similarity(
    v_relational.mean(dim=0),
    v_claimed_relational,
)

for i, tup in enumerate(claimed_sentences):
    print(f"S_claimed_sentence_{i}: {tup[0]}")
    print(f"alignment_score_{i}: {alignment_score[i].item()}")
    print(f"norm_{i}: {v_claimed_relational[i].norm().item():.4f}")

# 2-D projection for visualization; PCA is fit on the reference residues only
# and the claimed residues are projected into the same space.
pca = PCA(n_components=2)
v_relational_2d = pca.fit_transform(
    v_relational.cpu().numpy(),
)

print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.4f}")

v_claimed_relational_2d = pca.transform(
    v_claimed_relational.cpu().numpy(),
)

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=v_relational_2d[:, 0],
    y=v_relational_2d[:, 1],
    mode='markers',
    name="Relational",
    marker=dict(color="blue", opacity=0.5),
    text=[sentences[i][0] for i in range(len(v_relational_2d))],
))
# Boolean masks splitting claims by their human label for separate traces.
claimed_labels = [tup[3] for tup in claimed_sentences]
ok_mask = [label == "OK" for label in claimed_labels]
misleading_mask = [label == "Misleading claim" for label in claimed_labels]
+fig.add_trace(go.Scatter( + x=v_claimed_relational_2d[ok_mask, 0], + y=v_claimed_relational_2d[ok_mask, 1], + mode='markers', + name='Claimed OK', + marker=dict(color='green', opacity=0.7), + text=[claimed_sentences[i][0] for i, ok in enumerate(ok_mask) if ok] +)) +fig.add_trace(go.Scatter( + x=v_claimed_relational_2d[misleading_mask, 0], + y=v_claimed_relational_2d[misleading_mask, 1], + mode='markers', + name='Claimed Misleading', + marker=dict(color='red', opacity=0.7), + text=[claimed_sentences[i][0] for i, ms in enumerate(misleading_mask) if ms] +)) + +fig.update_layout( + title="PCA of Relational and Claimed Relational Vectors", + xaxis_title="PCA Component 1", + yaxis_title="PCA Component 2", + legend=dict(title="Legend"), +) + +# %% [markdown] +""" +$$\vec{v}_\text{politeness} = S_{\text{polite}} - S_{\text{impolite}}$$ + +Conclusion: It works for seen sentences, but not for unseen sentences +""" + +sentences = [ + ("Could you help me?", "Help me!"), + ("Can you assist me?", "Assist me!"), + ("Would you be able to help me?", "Help me!"), + ("I need your assistance.", "Need your assistance!"), + ("Please help me with this task.", "Help me with this task!"), +] + +unseen_sentences = [ + ("Look at this!", "Impolite"), + ("Pay attention to this!", "Impolite"), + ("Just do it!", "Impolite"), + ("I need you to do this!", "Impolite"), + ("Can you do this for me?", "Polite"), + ("I would appreciate your help with this.", "Polite"), +] + +embeddings = text2vec_model.predict( + [sentence for tup in sentences for sentence in tup], + source_lang="eng_Latn", +) + +unseen_embeddings = text2vec_model.predict( + [sentence for tup in unseen_sentences for sentence in tup], + source_lang="eng_Latn", +) + +v_politeness = t.stack([ + embeddings[i * 2 + 0] - embeddings[i * 2 + 1] + for i in range(len(sentences)) +]).mean(dim=0) + +intervened_embeddings = t.stack([ + embeddings[i * 2 + 1] + v_politeness + for i in range(len(sentences)) +]) + +intervened_unseen_embeddings = 
t.stack([ + unseen_embeddings[i * 2 + 1] + v_politeness + for i in range(len(unseen_sentences)) +]) + +decoded_v_politeness = vec2text_model.predict( + [v_politeness], + target_lang="eng_Latn", +) +decoded = vec2text_model.predict( + intervened_embeddings, + target_lang="eng_Latn", +) +decoded_unseen = vec2text_model.predict( + intervened_unseen_embeddings, + target_lang="eng_Latn", +) + +print(f"S_decoded_v_politeness: {decoded_v_politeness[0]}") +for i, tup in enumerate(sentences): + print(f"S_polite_{i}: {tup[0]}") + print(f"S_impolite_{i}: {tup[1]}") + print(f"S_decoded_{i}: {decoded[i]}") + print() + +for i, tup in enumerate(unseen_sentences): + print(f"S_unseen_{i}: {tup[0]}") + print(f"S_unseen_label_{i}: {tup[1]}") + print(f"S_decoded_unseen_{i}: {decoded_unseen[i]}") + print() + +# %% diff --git a/notes/pca.py b/notes/pca.py new file mode 100644 index 0000000..ae840a6 --- /dev/null +++ b/notes/pca.py @@ -0,0 +1,228 @@ +# %% [markdown] + +""" +Personal note: +1. What is PCA? + Principal Component Analysis (PCA) is a linear + dimensionality reduction technique. It transforms + a dataset with possibly correlated features into + a set of linearly uncorrelated variables called + principal components. The first principal + component captures the largest possible variance, + the second the next largest (orthogonal to the + first), and so on. PCA is widely used for + visualization, noise reduction and feature + extraction +""" + +# %% +# Example: PCA from 3D to 2D (Manual and +# scikit-learn) + +# Manual PCA (without scikit-learn) +import numpy as np + +# %% + +# Example 3D data (each row is a sample) +X = np.array([ + [2.5, 2.4, 0.5], + [0.5, 0.7, 0.2], + [2.2, 2.9, 0.7], + [1.9, 2.2, 0.3], + [3.1, 3.0, 0.9], + [2.3, 2.7, 0.6], + [2.0, 1.6, 0.4], + [1.0, 1.1, 0.1], + [1.5, 1.6, 0.2], + [1.1, 0.9, 0.1] +]) + +# %% +""" +Personal note: +1. Why do we center the data? 
# %%
"""
Personal note:
1. Why do we center the data?
   Centering (subtracting each column's mean) gives every feature zero
   mean. PCA measures variance about the origin, so without centering the
   first principal component can simply point towards the data mean rather
   than the true direction of maximum variance.
"""

# 1. Center the data
X_mean = X.mean(axis=0)        # per-feature (column) means
X_meaned = X - X_mean

print(f"X_mean:\n{X_mean}")
print(f"X_meaned:\n{X_meaned}")

# %%
"""
Personal note:
2. How is the covariance matrix computed? (example without `np.cov`)
   The covariance matrix measures how much each pair of features varies
   together. For an (n_samples x n_features) matrix X:

   Cov(i, j) = (1 / (n - 1)) * \sum_{k=1}^n (X_{k,i} - \bar{X}_i) * (X_{k,j} - \bar{X}_j)
"""

# Covariance matrix by hand: centered data, divided by (n - 1).
n_samples = X_meaned.shape[0]
cov_manual = (X_meaned.T @ X_meaned) / (n_samples - 1)

# 2. Compute covariance matrix
cov_mat = np.cov(X_meaned, rowvar=False)

print(f"Manual covariance matrix:\n{cov_manual}")
print(f"Covariance matrix:\n{cov_mat}")

# %%

"""
Personal note:
1. What are eigenvalues and eigenvectors?
   An eigenvector v of a square matrix A is a nonzero vector that A only
   stretches or shrinks without changing its direction: A v = \lambda v,
   where the scalar \lambda is the corresponding eigenvalue. In PCA the
   eigenvectors of the covariance matrix point along the directions of
   maximum variance (the principal components) and the eigenvalues say how
   much variance lies along each.
2. How are they computed? (example without `np.linalg.eigh`)
   Eigenvalues solve the characteristic equation det(A - \lambda I) = 0;
   for each eigenvalue the eigenvector solves (A - \lambda I) v = 0.
"""

# 3. Compute eigenvalues and eigenvectors (eigh: cov_mat is symmetric)
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

print(f"eigenvalues:\n{eig_vals}")
print(f"eigenvectors:\n{eig_vecs}")

# %%

# 4. Sort eigenpairs by eigenvalue, largest first.
sorted_idx = np.argsort(eig_vals)[::-1]
eig_vals = eig_vals[sorted_idx]
eig_vecs = eig_vecs[:, sorted_idx]

print("Sorted eigenvalues:\n", eig_vals)
print("Sorted eigenvectors:\n", eig_vecs)

# %%

# 5. Keep the two directions carrying the most variance.
eig_vecs_2d = eig_vecs[:, :2]

"""
Personal note:
1. Why select the top 2 eigenvectors by eigenvalue?
   Each eigenvector is a direction in feature space; its eigenvalue is the
   variance of the data along that direction. Projecting onto the
   largest-eigenvalue directions retains as much variance (information) as
   possible in fewer dimensions -- the core idea of PCA.
"""

# %%

# 6. Project data onto new 2D space
X_pca_manual = X_meaned @ eig_vecs_2d
print("Manual PCA result (first 2D points):\n", X_pca_manual[:3])

# %%
# scikit-learn PCA (for validation)

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca_sklearn = pca.fit_transform(X)

print("sklearn PCA result (first 2D points):\n", X_pca_sklearn[:3])

# %%

explained_variance_ratio_manual = eig_vals / np.sum(eig_vals)

print("Manual PCA explained variance ratio:", explained_variance_ratio_manual)
print("PCA explained variance ratio:", pca.explained_variance_ratio_)

"""
Personal note:
3. What is the PCA explained variance ratio? Is [0.1, 0.017] good?
   Each value is the fraction of total variance captured by that
   component: [0.1, 0.017] means the first two components together keep
   only ~11.7% of the variance. That is not good for visualization -- a 2D
   plot would miss most of the structure, and apparent clusters may not
   reflect real high-dimensional relationships. Rule of thumb: the first
   two components should explain a substantial share (e.g. >50%) before a
   2D PCA plot is trusted.
"""
"""
Summary:
- PCA finds the directions of maximum variance.
- You can do PCA manually with numpy (see above).
- The explained variance ratio tells you how much of the data's structure
  is captured; [0.1, 0.017] is low, so the plot may not be representative
  of the real data structure.
"""
# %%

# ==== next file in this dump: notes/pca_steering.py ====

# %%

print("Importing libraries...")
import numpy as np
import plotly.graph_objs as go
from sklearn.decomposition import PCA

# %%

print("Preparing data for PCA...")
# Corners of a small cube; cluster1 shifts it row-by-row, cluster2 scales it.
original = np.array([
    [1, 1, 1],
    [1, 2, 1],
    [2, 1, 1],
    [2, 2, 1],
    [1, 1, 2],
    [1, 2, 2],
    [2, 1, 2],
    [2, 2, 2],
])
cluster1 = original * 1 - [[1, 8, 1],
                           [1, 7, 2],
                           [1, 6, 3],
                           [1, 5, 4],
                           [1, 4, 5],
                           [1, 3, 6],
                           [1, 2, 7],
                           [1, 1, 8]]
cluster2 = original * 3
print(f"cluster1:\n{cluster1}")
print(f"cluster2:\n{cluster2}")
X = np.vstack([cluster1, cluster2])

# %%


def _markers3d(pts, name, color, size=6, symbol=None, text=None):
    """One 3D marker trace for an (n, 3) array of points."""
    marker = dict(size=size, color=color)
    if symbol is not None:
        marker['symbol'] = symbol
    return go.Scatter3d(
        x=pts[:, 0], y=pts[:, 1], z=pts[:, 2],
        mode='markers', marker=marker, name=name, text=text,
    )


def _dashed3d(a, b, name, color, width, marker_size=2):
    """Dashed 3D segment from point a to point b."""
    return go.Scatter3d(
        x=[a[0], b[0]], y=[a[1], b[1]], z=[a[2], b[2]],
        mode='lines+markers',
        marker=dict(size=marker_size, color=color),
        line=dict(color=color, width=width, dash='dash'),
        name=name,
    )


print("Plotting original 3D data...")
fig = go.Figure()
fig.add_trace(_markers3d(cluster1, 'cluster1', 'blue'))
fig.add_trace(_markers3d(cluster2, 'cluster2', 'red'))
fig.update_layout(
    scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z'),
    margin=dict(l=0, r=0, b=0, t=0),
)
fig.show()

# %%

print("Performing PCA to reduce from 3D to 2D...")
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X)

# %%

print("Plotting PCA result in 2D...")
fig = go.Figure()
fig.add_trace(go.Scatter(x=X_2d[:8, 0], y=X_2d[:8, 1], mode='markers',
                         marker=dict(size=6, color='blue'), name='cluster1'))
fig.add_trace(go.Scatter(x=X_2d[8:, 0], y=X_2d[8:, 1], mode='markers',
                         marker=dict(size=6, color='red'), name='cluster2'))
fig.update_layout(xaxis_title='PC1', yaxis_title='PC2',
                  margin=dict(l=0, r=0, b=0, t=0))
fig.show()

# %%

point_idx = 1
scaling = 1.0

print("Shifting a point towards the direction of cluster2...")

original_point = X[point_idx]

cluster1_mean = cluster1.mean(axis=0)
cluster2_mean = cluster2.mean(axis=0)
direction = cluster2_mean - cluster1_mean
direction_unit = direction / np.linalg.norm(direction)

# Project (point -> cluster2 mean) onto the inter-cluster axis and move the
# point exactly that far along the axis.
point_to_cluster2_mean = cluster2_mean - original_point
distance_to_cluster2 = np.dot(direction_unit, point_to_cluster2_mean)
adaptive_shift = distance_to_cluster2 * direction_unit

# equivalent formulation:
# proj = np.dot(direction_unit, point_to_cluster2_mean)
# adaptive_shift = proj / np.linalg.norm(direction) * direction

shifted_point = original_point + scaling * adaptive_shift

original_2d = pca.transform([original_point])[0]
shifted_2d = pca.transform([shifted_point])[0]

print("Plot original and shifted point with an arrow")
fig = go.Figure()

# Cluster points, labelled by row index so they can be identified on hover.
fig.add_trace(_markers3d(cluster1, 'cluster1', 'blue',
                         text=[f"id: {i}" for i in range(len(cluster1))]))
fig.add_trace(_markers3d(cluster2, 'cluster2', 'red',
                         text=[f"id: {i+8}" for i in range(len(cluster2))]))

# Cluster means as open circles.
fig.add_trace(_markers3d(np.array([cluster1_mean]), 'cluster1 mean', 'blue',
                         symbol='circle-open'))
fig.add_trace(_markers3d(np.array([cluster2_mean]), 'cluster2 mean', 'red',
                         symbol='circle-open'))

# Inter-cluster axis, extended one unit beyond each mean.
fig.add_trace(_dashed3d(cluster1_mean - direction_unit,
                        cluster2_mean + direction_unit,
                        'direction unit axis', 'black', 4))

# Vector from the chosen point to the cluster2 mean.
fig.add_trace(_dashed3d(original_point, cluster2_mean,
                        'point_to_cluster2_mean', 'purple', 6))

# The steered point.
fig.add_trace(_markers3d(np.array([shifted_point]), 'shifted_point', 'orange',
                         symbol='diamond'))

# Arrow from the original point to the shifted point.
fig.add_trace(_dashed3d(original_point, shifted_point,
                        'shift_vector', 'black', 6))

# Dividing plane through cluster2_mean with normal direction_unit.
# Reference: https://www.youtube.com/watch?v=2sZKZHyaQJ8
# Plane through (x0, y0, z0) with normal (a, b, c):
#   a(x-x0) + b(y-y0) + c(z-z0) = 0  =>  z = z0 - (a(x-x0) + b(y-y0)) / c
# NOTE(review): this divides by c and would fail if direction_unit[2] == 0;
# fine for this fixed dataset, but worth guarding if the data changes.
xx, yy = np.meshgrid(
    np.linspace(cluster2_mean[0] - 1, cluster2_mean[0] + 1, 10),
    np.linspace(cluster2_mean[1] - 1, cluster2_mean[1] + 1, 10),
)
a, b, c = direction_unit
x0, y0, z0 = cluster2_mean
zz = z0 - (a * (xx - x0) + b * (yy - y0)) / c

fig.add_trace(go.Surface(
    x=xx, y=yy, z=zz,
    colorscale='Greys',
    opacity=0.5,
    showscale=False,
    name='Dividing Plane',
))

fig.update_layout(
    scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z',
               aspectmode='data'),
    margin=dict(l=0, r=0, b=0, t=0),
)
fig.show()

# %%

print("Plot PCA result in 2D with shifted point...")


def _markers2d(xy, name, color, size=6, symbol=None):
    """One 2D marker trace for an (n, 2) array of points."""
    marker = dict(size=size, color=color)
    if symbol is not None:
        marker['symbol'] = symbol
    return go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='markers',
                      marker=marker, name=name)


def _dashed2d(a, b, name, color, width=2, marker_size=2):
    """Dashed 2D segment from point a to point b."""
    return go.Scatter(
        x=[a[0], b[0]], y=[a[1], b[1]],
        mode='lines+markers',
        marker=dict(size=marker_size, color=color),
        line=dict(color=color, width=width, dash='dash'),
        name=name,
    )


fig = go.Figure()

# Clusters in PC space.
fig.add_trace(_markers2d(X_2d[:8], 'cluster1', 'blue'))
fig.add_trace(_markers2d(X_2d[8:], 'cluster2', 'red'))

# Cluster means projected into PC space.
cluster1_mean_2d = pca.transform([cluster1_mean])[0]
cluster2_mean_2d = pca.transform([cluster2_mean])[0]
fig.add_trace(_markers2d(np.array([cluster1_mean_2d]), 'cluster1 mean',
                         'blue', symbol='circle-open'))
fig.add_trace(_markers2d(np.array([cluster2_mean_2d]), 'cluster2 mean',
                         'red', symbol='circle-open'))

# Project the 3D direction into PC space and re-normalise it.
direction_unit_2d = direction_unit @ pca.components_.T
direction_unit_2d = direction_unit_2d / np.linalg.norm(direction_unit_2d)
fig.add_trace(_dashed2d(cluster2_mean_2d - direction_unit_2d,
                        cluster2_mean_2d + direction_unit_2d,
                        'direction unit axis', 'black'))

# point_to_cluster2_mean vector in PC space.
fig.add_trace(_dashed2d(original_2d, cluster2_mean_2d,
                        'point_to_cluster2_mean', 'purple'))

# The steered point.
fig.add_trace(_markers2d(np.array([shifted_2d]), 'shifted_point', 'orange',
                         symbol='diamond'))

# Arrow from the original point to the shifted point.
fig.add_trace(_dashed2d(original_2d, shifted_2d, 'shift_vector', 'black'))

fig.update_layout(xaxis_title='PC1', yaxis_title='PC2',
                  margin=dict(l=0, r=0, b=0, t=0))
fig.show()

# %%

# ==== next file in this dump: notes/umap.py ====

# %%
"""
Personal note:
1. What is UMAP?
   UMAP (Uniform Manifold Approximation and Projection) is a nonlinear
   dimensionality-reduction technique. It builds a weighted graph
   representing the data's manifold structure, then optimises a
   low-dimensional embedding that preserves local structure (and some
   global structure). It is widely used for visualization and clustering,
   and is generally faster and more scalable than t-SNE.
"""
+ + Step 1: "Manual" Neighbor-pPreserving Projection + (for illustration) + + Suppose you have 3D points in a cluster and + want to project to 2D, preserving nearest neighbors: +""" + +import numpy as np +import matplotlib.pyplot as plt +import umap + +# Generate synthetic 3D data (e.g., a noisy spiral) +np.random.seed(42) +t = np.linspace(0, 4 * np.pi, 200) +x = np.sin(t) +y = np.cos(t) +z = t + 0.1 * np.random.randn(200) +data_3d = np.vstack([x, y, z]).T + +# "Manual" projection: project onto the plane defined by the first two principal axes (like PCA) +from sklearn.decomposition import PCA +pca = PCA(n_components=2) +manual_2d = pca.fit_transform(data_3d) + +print(f"PCA Explained Variance Ratio: {pca.explained_variance_ratio_}") + +# UMAP projection +umap_2d = umap.UMAP(n_components=2, random_state=42).fit_transform(data_3d) + +# Plot both +fig, axs = plt.subplots(1, 2, figsize=(10, 4)) +axs[0].scatter(manual_2d[:, 0], manual_2d[:, 1], c=t, cmap='viridis') +axs[0].set_title('Manual (PCA-like) 2D Projection') +axs[1].scatter(umap_2d[:, 0], umap_2d[:, 1], c=t, cmap='viridis') +axs[1].set_title('UMAP 2D Projection') +plt.show() + +""" + - The manual projection (using PCA) simply projects + onto the axes of maximum variance. + - UMAP will try to preserve local neighbor relationships, + possibly revealing more structure + +3. Does UMAP have an "explained variance ratio" like + PCA? + + No, UMAP does not provide an explained variance ratio. + - PCA is a linear method and each principal component + explains a portion of the total variance, which + can be quantified + - UMAP is nonlinear and focuses on preserving local + structure (neighborhoods), not maximizing variance. + There is no direct analog to explained variance ratio + in UMAP. + - If you need to quantify how much information + is preserved, you can use trustworthiness, + continuity, or reconstruction error, + but these are not provided by default in UMAP. 
+ +Summary: +- UMAP is for nonlinear neighbor-preserving embedding, + not variance maximization. +- No explained variance ratio is available for UMAP. +""" + +""" +Personal note: +1. What do you mean by "It aims to preserve both local and some global structure of high-dimensional data when projecting it into a lower-dimensional space"? What do you mean by local and global structure? What is local and global structure? + + Local structure refers to the relationships between + each data point and its nearest neighbors–how close + or similar points are to each other in small + neighborhoods. + - Example: If two points are very close in the + original high-dimensional space, a method + that preserves local structure will keep them + close in the low-dimensional embedding + + Global structure refers to the overall arangement + and relationships between distant groups or + clusters in the data–the "big picture" of how all + points and clusters relate to each other + - Example: if there are three well-separated + clusters in high-dimensional space, a method that + preserves global structure will keep those clusters + separated and in roughly the same arrangement + in the low-dimensional embedding + + In summary: + - Local structure: Small-scale relationships (nearest + neighbors, small clusters). + - Global structure: Large-scale relationships + (cluster separation, overall data geometry). + + UMAP tries to preerve botH: + - It keeps neighbors together (local structure) + - It also tries to maintain the relative + positions of clusters or groups (some global + structure), though not as strictly as local structure +""" + +""" +Personal note: +1. Does PCA aims to preserve both local and some global structure of high-dimensional data when projecting it into a lower-dimensional space too? + PCA primarily aims to preserve global structure–it + projects data onto new axes (principal components) + that capture the most variance in the entire dataset. 
+ This means PCA tries to maintain the overall shape, + spread, and large-scale relationships of the data. + + However, PCA does not explicitly preserve local + structure (the relationships between nearest + neighbors). Sometimes, local structure is preserved + as a side effect if it aligns with the directions + of high variance, but PCA does not optimize for local + neighborhoods + + Summary: + - PCA: Preserves global structure (variance, overall + geometry), but not specifically local structure. + - UMAP: Explicitly preserves local structure + (neighbor relationships) and tries to maintain + some global structure +""" + +# %% \ No newline at end of file diff --git a/src/plot_pca.py b/src/plot_pca.py index 38c7730..b1c6b35 100644 --- a/src/plot_pca.py +++ b/src/plot_pca.py @@ -1,4 +1,5 @@ # %% + import os import json import torch @@ -10,7 +11,7 @@ # Assuming these utils are in the same directory or accessible from utils_load_data import load_embeds, load_split_paragraphs -from utils_sonar import load_tokenizer # Or use the alternative below +# from utils_sonar import load_tokenizer # Or use the alternative below # --- Alternative Tokenizer Loading --- # try: @@ -32,6 +33,8 @@ import phate # Import PHATE import plotly.express as px # For interactive plots +# %% + # --- Configuration --- unit = "Characters" # Or "Tokens" MAX_FILES_TO_LOAD = 100 # Adjust as needed for memory/time @@ -42,6 +45,8 @@ INTERACTIVE_SUBSAMPLE_SIZE = 2000 RANDOM_STATE = 42 # For reproducibility +# %% + # --- Load Tokenizer --- # tokenizer = load_tokenizer() # Uncomment if using the alternative loading @@ -106,6 +111,7 @@ def load_data_for_analysis(max_files=MAX_FILES_TO_LOAD): # Return numpy array for vectors, lists/numpy arrays for metadata return vec_array, all_texts, np.array(all_lengths), np.array(all_labels) +# %% # --- Plotting Functions --- def plot_dimensionality_reduction_static(results, labels, lengths, title): @@ -240,7 +246,8 @@ def run_tsne_analysis(data, texts, labels, 
lengths): print("\nRunning t-SNE...") tsne_model = TSNE( n_components=2, perplexity=30, learning_rate='auto', - n_iter=300, init='pca', random_state=RANDOM_STATE, + n_iter_without_progress=300, + init='pca', random_state=RANDOM_STATE, n_jobs=-1, verbose=1 ) # Fit t-SNE on the potentially pre-subsampled data passed to this function @@ -297,6 +304,59 @@ def run_phate_analysis(data, texts, labels, lengths): ) return phate_results +# %% + +""" +Personal note: +1. You are an expert in Principal Component + Analysis (PCA), Uniform Manifold Approximation + and Projection (UMAP), t-distributed stochastic + neighbor embedding (t-SNE), + Potential of Heat-diffusion for Affinity-based + Trajectory Embedding (PHATE). What is the goal + of PCA? What is the goal of UMAP? What is the + goal of t-SNE? What is the goal of PHATE? + + PCA (Principal Component Analysis): + The goal of PCA is to reduce dimensionality of + data by finding new orthogonal axes + (principal components) that capture the maximum + variance in the data. It projects the data onto + these axes, allowing you to represent high- + dimensional data in fewer dimensions + (often 2 or 3) while preserving as much + as information (variance) as possible + + UMAP (Uniform Manifold Approximation and Projection): + UMAP aims to reduce dimensionality while preserving + both local and some global structure of the data. + It is based on manifold learning and constructs + a high-dimensional graph, + then optimizes a low-dimensional graph to be + as structurally similar as possible. + UMAP is often used for visualization and clustering, + and is faster and more scalable than t-SNE + + t-SNE (t-distributed Stochastic Neighbor Embedding): + The goal of t-SNE is to visualize high-dimensional + data by reducing it to two or three dimensions, + focusing on preserving local similarities (i.e., + points that are close in high-dimensional space + remain close in the low-dimensional embedding). 
+ It is particularly good for visualizing clusters, + but does not preserve global structure well. + + PHATE (Potential of Heat-difussion for Affinity- + based Trajectory Embedding): + PHATE is designed to capture both local and + global nonlinear structure in high-dimensional + data, especially for data with continuous trajectories + (e.g., biological processes). It uses diffusion + geometry to model transitions and relationships, + producing embeddings that reveal both clusters + and progression/trajectory patterns +""" + # --- Main Execution --- if __name__ == "__main__": # 1. Load Data (including texts now) @@ -334,10 +394,6 @@ def run_phate_analysis(data, texts, labels, lengths): print("\nAnalysis complete.") -# %% - -print("hello") - # %% def plot_dimensionality_reduction_interactive_length(results, texts, labels, lengths, title): """Plots interactive results using Plotly, coloring by length."""