diff --git a/public/assets/dense_model.png b/public/assets/dense_model.png new file mode 100644 index 0000000..a418784 Binary files /dev/null and b/public/assets/dense_model.png differ diff --git a/public/assets/expert_layers.png b/public/assets/expert_layers.png new file mode 100644 index 0000000..7b1e764 Binary files /dev/null and b/public/assets/expert_layers.png differ diff --git a/public/assets/expert_specialization.png b/public/assets/expert_specialization.png new file mode 100644 index 0000000..891c44d Binary files /dev/null and b/public/assets/expert_specialization.png differ diff --git a/public/assets/importance_histogram.png b/public/assets/importance_histogram.png new file mode 100644 index 0000000..b12fea8 Binary files /dev/null and b/public/assets/importance_histogram.png differ diff --git a/public/assets/routing_diagram.png b/public/assets/routing_diagram.png new file mode 100644 index 0000000..3beda2e Binary files /dev/null and b/public/assets/routing_diagram.png differ diff --git a/public/assets/softmax.png b/public/assets/softmax.png new file mode 100644 index 0000000..a5052e1 Binary files /dev/null and b/public/assets/softmax.png differ diff --git a/public/assets/sparse_model.png b/public/assets/sparse_model.png new file mode 100644 index 0000000..90bcb63 Binary files /dev/null and b/public/assets/sparse_model.png differ diff --git a/public/assets/x_times_w.png b/public/assets/x_times_w.png new file mode 100644 index 0000000..e62932c Binary files /dev/null and b/public/assets/x_times_w.png differ diff --git a/src/App.tsx b/src/App.tsx index 3df9b13..b0ba77a 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -1,6 +1,7 @@ import { BrowserRouter, Routes, Route } from 'react-router-dom' import LandingPage from './components/layout/LandingPage' import VisualizerPage from './components/layout/VisualizerPage' +import DocsPage from './components/layout/DocsPage' function App() { return ( @@ -8,6 +9,7 @@ function App() { } /> } /> + } /> ) diff --git 
a/src/components/layout/DocsPage.module.css b/src/components/layout/DocsPage.module.css new file mode 100644 index 0000000..8b5761d --- /dev/null +++ b/src/components/layout/DocsPage.module.css @@ -0,0 +1,355 @@ +.container { + min-height: 100vh; + display: flex; + flex-direction: column; + background-color: var(--color-background); +} + +/* Header */ +.header { + position: sticky; + top: 0; + z-index: 100; + background: rgba(15, 23, 42, 0.8); + backdrop-filter: blur(10px); + border-bottom: 1px solid rgba(148, 163, 184, 0.1); +} + +.headerContent { + max-width: 1200px; + margin: 0 auto; + padding: var(--spacing-md) var(--spacing-lg); + display: flex; + justify-content: space-between; + align-items: center; +} + +.logo { + display: flex; + align-items: center; + gap: var(--spacing-sm); + text-decoration: none; + font-size: 1.25rem; + font-weight: 600; + color: var(--color-text); +} + +.logoText { + background: linear-gradient(135deg, var(--color-primary), var(--color-accent)); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; +} + +.nav { + display: flex; + gap: var(--spacing-lg); +} + +.nav a { + color: var(--color-text-secondary); + text-decoration: none; + font-weight: 500; + transition: color var(--transition-fast); +} + +.nav a:hover { + color: var(--color-text); +} + +/* Main Content */ +.main { + flex: 1; + padding: var(--spacing-xl) var(--spacing-lg); +} + +.content { + max-width: 900px; + margin: 0 auto; + background: var(--color-surface); + border: 1px solid var(--color-surface-light); + border-radius: var(--radius-xl); + padding: var(--spacing-xl); + box-shadow: var(--shadow-lg); +} + +.content h1 { + font-size: 2.5rem; + background: linear-gradient(135deg, var(--color-primary), var(--color-accent)); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; + margin-bottom: var(--spacing-xl); + border-bottom: 2px solid var(--color-surface-light); + padding-bottom: 
var(--spacing-md); +} + +.content h2 { + font-size: 2rem; + color: var(--color-text); + margin-top: var(--spacing-xl); + margin-bottom: var(--spacing-md); +} + +.content h3 { + font-size: 1.5rem; + color: var(--color-text); + margin-top: var(--spacing-lg); + margin-bottom: var(--spacing-sm); +} + +.content h4 { + font-size: 1.2rem; + color: var(--color-text); + margin-top: var(--spacing-md); + margin-bottom: var(--spacing-sm); +} + +.content p { + line-height: 1.8; + color: var(--color-text-secondary); + margin-bottom: var(--spacing-md); +} + +.content ul, +.content ol { + margin-left: var(--spacing-lg); + margin-bottom: var(--spacing-md); + line-height: 1.8; +} + +.content li { + margin-bottom: var(--spacing-sm); + color: var(--color-text-secondary); +} + +.content code { + background: var(--color-background); + padding: 2px 6px; + border-radius: var(--radius-sm); + font-family: 'Courier New', monospace; + font-size: 0.9em; + color: var(--color-accent); + border: 1px solid var(--color-surface-light); +} + +/* Sections */ +.section { + margin-bottom: var(--spacing-xl); +} + +.step { + margin-bottom: var(--spacing-lg); + padding-left: var(--spacing-md); + border-left: 3px solid var(--color-primary); +} + +.feature { + margin-bottom: var(--spacing-lg); + padding: var(--spacing-md); + background: var(--color-background); + border-radius: var(--radius-md); + border-left: 4px solid var(--color-accent); +} + +.techDetail { + margin-bottom: var(--spacing-lg); + padding: var(--spacing-md); + background: var(--color-background); + border-radius: var(--radius-md); + border: 1px solid var(--color-surface-light); +} + +/* Info Box */ +.infoBox { + background: rgba(99, 102, 241, 0.1); + border-left: 4px solid var(--color-primary); + padding: var(--spacing-md); + border-radius: var(--radius-sm); + margin: var(--spacing-md) 0; +} + +.infoBox strong { + color: var(--color-primary); +} + +/* Formula */ +.formula { + display: block; + background: var(--color-background); + color: 
var(--color-text); + padding: var(--spacing-md); + border-radius: var(--radius-md); + margin: var(--spacing-md) 0; + font-family: 'Courier New', monospace; + overflow-x: auto; + border: 1px solid var(--color-surface-light); +} + +/* Code Block */ +.codeBlock { + background: var(--color-background); + border: 1px solid var(--color-surface-light); + border-radius: var(--radius-md); + padding: var(--spacing-md); + margin: var(--spacing-md) 0; + overflow-x: auto; +} + +.codeBlock code { + display: block; + font-family: 'Consolas', 'Monaco', 'Courier New', monospace; + font-size: 0.95em; + line-height: 1.6; + color: var(--color-text); + background: none; + border: none; + padding: 0; +} + +/* Note */ +.note { + font-style: italic; + color: var(--color-text-secondary); + font-size: 0.95em; + opacity: 0.8; +} + +/* Images */ +.imageContainer { + margin: var(--spacing-lg) 0; + text-align: center; +} + +.image { + width: 100%; + max-width: 500px; + height: auto; + border-radius: var(--radius-md); + border: 1px solid var(--color-surface-light); + box-shadow: var(--shadow-md); +} + +.imageCaption { + margin-top: var(--spacing-sm); + font-size: 0.9em; + color: var(--color-text-secondary); + font-style: italic; +} + +/* Steps */ +.steps { + counter-reset: step-counter; + list-style: none; + margin-left: 0; +} + +.steps li { + counter-increment: step-counter; + position: relative; + padding-left: var(--spacing-xl); + margin-bottom: var(--spacing-md); +} + +.steps li::before { + content: counter(step-counter); + position: absolute; + left: 0; + top: 0; + width: 28px; + height: 28px; + background: linear-gradient(135deg, var(--color-primary), var(--color-secondary)); + color: white; + border-radius: 50%; + display: flex; + align-items: center; + justify-content: center; + font-weight: bold; + font-size: 0.9em; +} + +/* Resources */ +.resources { + list-style: none; + margin-left: 0; +} + +.resources li { + margin-bottom: var(--spacing-md); +} + +.resources a { + color: 
var(--color-primary); + text-decoration: none; + font-weight: 500; + transition: color var(--transition-fast); +} + +.resources a:hover { + color: var(--color-accent); + text-decoration: underline; +} + +/* Footer */ +.footer { + padding: var(--spacing-xl) var(--spacing-lg); + background: var(--color-background); + border-top: 1px solid rgba(148, 163, 184, 0.1); + margin-top: auto; +} + +.footerContent { + max-width: 1200px; + margin: 0 auto; + display: flex; + justify-content: space-between; + align-items: center; + flex-wrap: wrap; + gap: var(--spacing-md); +} + +.footerContent p { + color: var(--color-text-secondary); + margin: 0; +} + +.footerLinks { + display: flex; + gap: var(--spacing-lg); +} + +.footerLinks a { + color: var(--color-text-secondary); + text-decoration: none; + transition: color var(--transition-fast); +} + +.footerLinks a:hover { + color: var(--color-text); +} + +/* Responsive */ +@media (max-width: 768px) { + .content { + padding: var(--spacing-lg); + } + + .content h1 { + font-size: 2rem; + } + + .content h2 { + font-size: 1.5rem; + } + + .content h3 { + font-size: 1.2rem; + } + + .footerContent { + flex-direction: column; + gap: var(--spacing-md); + text-align: center; + } +} + diff --git a/src/components/layout/DocsPage.tsx b/src/components/layout/DocsPage.tsx new file mode 100644 index 0000000..840dbc5 --- /dev/null +++ b/src/components/layout/DocsPage.tsx @@ -0,0 +1,299 @@ +import { Link } from 'react-router-dom' +import styles from './DocsPage.module.css' + +function DocsPage() { + return ( +
+ {/* Header */} +
+
+ + MoE Visualizer + + +
+
+ + {/* Main Content */} +
+
+ {/* What is MoE */} +
+

 What is Mixture of Experts?

+

+ Mixture of Experts (MoE) is a neural network architecture that uses multiple specialized + sub-networks (experts) to process different parts of the input. Instead of routing all data + through the same network, a gating network learns to dynamically select which + experts should process each token. +

+ +
+

Key Benefits

+
    +
  • Scalability: Increase model capacity without proportionally increasing compute
  • +
  • Specialization: Experts specialize in different syntactic patterns (e.g., verbs, nouns, and adjectives)
  • +
  • Sparse activation: Only activate a subset of experts per token
  • +
+
+
+ + {/* How it Works */} +
+

MoE consists of two key components:

+
    +
  • + Router (Gate Network): A learned network that decides which experts should process + each token. For every input, it computes a score for each expert and selects the top-K + highest-scoring ones to handle that token. +
  • +
  • + Experts: A set of specialized Feed-Forward Neural Networks (FFNNs). Instead of + one shared FFNN processing all tokens, MoE has multiple expert FFNNs, each learning to handle + different patterns or input types (usually syntactic patterns such as verbs, nouns, and adjectives). +
  • +
    + Expert Specialization +

    Each expert specializes in different syntactic patterns during training

    +
    +
+ +

Sparse vs. Dense Models

+

+ To understand MoE, it's important to contrast it with traditional dense models: +

+ +
+

Dense Model (Traditional)

+

+ In a standard transformer, every token passes through the same Feed-Forward Neural Network (FFNN) + at each layer. This means: +

+
    +
  • All parameters are activated for every token
  • +
  • Computation scales linearly with model size
  • +
  • Simple and stable, but inefficient for very large models
  • +
+
+ Dense MoE Model Architecture +

Dense MoE architecture: All experts are selected and activated per token

+
+
+ +
+

Sparse Model (MoE)

+

+ In MoE, each FFNN layer is replaced by multiple expert FFNNs, but only a subset + of experts process each token. This means: +

+
    +
  • Only top-K experts are activated per token (sparse activation)
  • +
  • Per-token expert computation stays roughly constant regardless of the total expert count
  • +
  • More complex to train, but enables massive model scaling
  • + +
+
+ Sparse MoE Model Architecture +

Sparse MoE architecture: Only selected experts (highlighted) are activated per token

+
+
+ + + +

Step-by-Step Process:

+ +

1. Gating Network (Router)

+
+ MoE Routing Diagram +

Token routing process: Gating → Top-K Selection → Expert Processing

+
+

+ For each input token, the gating network computes a score for every expert. +

+ +

Step 1a: Linear Transformation

+

+ First, the token embedding is multiplied by the gating weight matrix W_gate: +

+
+                h = token_embedding × W_gate
+              
+ +
+ Matrix multiplication h = x × W +

Linear transformation: token embedding (x) multiplied by weight matrix (W)

+
+ +

+ Where: +

+
    +
  • token_embedding: Vector representation of the input token (e.g., 512 dimensions)
  • +
  • W_gate: Learned weight matrix (e.g., 512 × num_experts)
  • +
  • h: Raw logits/scores for each expert (one score per expert)
  • +
+ +

Step 1b: Softmax Normalization

+

+ The raw scores are then normalized using the softmax function to produce probabilities: +

+
+                scores = softmax(h)
+              
+ +
+ Softmax function +

Softmax converts raw scores into a probability distribution that sums to 1

+
+ +

+ Softmax ensures every score lies between 0 and 1 and that all scores sum to exactly 1. The result is a probability distribution over all experts, indicating how suitable each expert + is for processing this particular token.

+

Step 1c: Repeat!

+
+ Expert FFN Layers +

Multi-layer MoE architecture: Each token goes through multiple MoE layers

+
+

+ At each MoE layer, the router independently computes scores and selects experts for every token. + This means a token may be routed to different experts at different layers. + Each layer's routing decision is independent and learned during training. +

+

2. Top-K Selection

+

+ Instead of using all experts, we select only the top-K experts with + the highest scores. Common values: K=1 or K=2. +

+ +
+ Trade-off: Higher K = more compute but potentially better quality. + Lower K = faster but experts must be more specialized. +
+ +

3. Token Routing

+

+ Each token is routed to its top-K selected experts. Tokens assigned to the same + expert are batched together for efficient processing. +

+
+

Batch Processing

+

+ Tokens routed to the same expert are batched together for efficiency: +

+
    +
  • Input shape changes from [1, 512] to [batch_size, 512]
  • +
  • Processing time scales with batch size
  • +
  • All tokens in a batch complete simultaneously
  • +
+
+

4. Expert Processing (FFN)

+

+ Each expert is a Feed-Forward Network (FFN) that transforms the input: +

+
+                FFN(x) = W₂ × ReLU(W₁ × x)
+              
+ +

+ Where: +

+
    +
  • W₁: First linear layer (token_embedding → hidden_dimensions)
  • +
  • ReLU: Activation function (element-wise)
  • +
  • W₂: Second linear layer (hidden_dimensions → token_embedding)
  • +
+ +

5. Output Combination

+

+ The outputs from selected experts are weighted by their gating scores and summed: +

+
+                output = Σ (score_i × expert_i(token))
+              
+
+ + {/* Load Balancing */} +
+

Load Balancing Challenge

+

+ A key challenge in MoE is load balancing. Without constraints, the gating + network often learns to overuse a few "favorite" experts while ignoring others. +

+
+

Why This Happens

+
    +
  • The gating network optimizes for accuracy, not balance
  • +
  • Popular experts get more gradient updates, improving faster
  • +
  • Creates a feedback loop: good experts → more use → better experts
  • +
+
+

+ Solutions include auxiliary losses, capacity constraints, and expert dropout to encourage + more balanced routing. +

+
+ + {/* Getting Started */} +
+

Getting Started

+
    +
  1. Navigate to the Demo
  2. +
  3. Enter a text prompt or word in the input box
  4. +
  5. Click "Process Tokens" or press Enter
  6. +
  7. Watch the token get scored, routed, and processed
  8. +
  9. Click on experts to see FFN internals
  10. +
  11. Adjust controls to experiment with different configurations
  12. +
  13. Add more tokens to see batch processing and load distribution
  14. +
+
+ + {/* Resources */} +
+

Further Reading

+ +
+
+
+ + {/* Footer */} + +
+ ) +} + +export default DocsPage + diff --git a/src/components/layout/LandingPage.tsx b/src/components/layout/LandingPage.tsx index 181783e..adffcf7 100644 --- a/src/components/layout/LandingPage.tsx +++ b/src/components/layout/LandingPage.tsx @@ -12,9 +12,7 @@ function LandingPage() { MoE Visualizer @@ -47,13 +45,12 @@ function LandingPage() { {/* Footer */} diff --git a/src/components/layout/VisualizerPage.tsx b/src/components/layout/VisualizerPage.tsx index 38847da..f647d13 100644 --- a/src/components/layout/VisualizerPage.tsx +++ b/src/components/layout/VisualizerPage.tsx @@ -33,8 +33,6 @@ function VisualizerPage() {