Add Terminal-Based Visualization Tool for Tokenized Data Points in Tiktoken Tokenizer #314

Open · wants to merge 14 commits into main

5 changes: 5 additions & 0 deletions Cargo.toml
@@ -25,3 +25,8 @@ fancy-regex = "0.13.0"
regex = "1.10.3"
rustc-hash = "1.1.0"
bstr = "1.5.0"
# cli dependencies
tui-textarea = "0.7.0"
crossterm = "0.28.0"
ratatui = "0.29.0"
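
The three new dependencies split the TUI work: `crossterm` toggles raw mode and the alternate screen, `ratatui` renders the widgets, and `tui-textarea` supplies the editable input pane. The sketch below (not taken from the PR; tokenizer logic omitted and error handling kept minimal) shows the setup/draw/teardown pattern they provide, which `_environment` in `src/lib.rs` builds on:

```rust
use crossterm::event::{read, Event, KeyCode};
use crossterm::terminal::{
    disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen,
};
use ratatui::{backend::CrosstermBackend, Terminal};
use tui_textarea::TextArea;

fn main() -> std::io::Result<()> {
    // Take over the terminal: raw mode plus an alternate screen buffer.
    enable_raw_mode()?;
    let mut stdout = std::io::stdout();
    crossterm::execute!(stdout, EnterAlternateScreen)?;
    let mut term = Terminal::new(CrosstermBackend::new(stdout))?;

    let mut textarea = TextArea::default();
    loop {
        // Redraw the UI, then block until the next input event.
        term.draw(|f| f.render_widget(&textarea, f.area()))?;
        if let Event::Key(key) = read()? {
            if key.code == KeyCode::Esc {
                break;
            }
            textarea.input(key);
        }
    }

    // Always restore the terminal before returning.
    disable_raw_mode()?;
    crossterm::execute!(term.backend_mut(), LeaveAlternateScreen)?;
    term.show_cursor()?;
    Ok(())
}
```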

1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,3 +1,4 @@
include *.png
include *.svg
include *.toml
include *.md
11 changes: 11 additions & 0 deletions README.md
@@ -129,3 +129,14 @@ setup(
Then simply `pip install ./my_tiktoken_extension` and you should be able to use your
custom encodings! Make sure **not** to use an editable install.


## Tiktoken tokenizer environment

Test your tokenizer in a terminal-based environment that visualizes tokenized data points. The tool gives immediate feedback on how input text is tokenized: as you type, you can see each token, its boundaries in the input text, and a running token count, which makes the tokenizer easier to understand and debug.

```python
import tiktoken
enc = tiktoken.get_encoding("gpt2")
enc.environment()
```
![image](/environment.png)
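
For reference, the per-token breakdown the TUI displays can also be reproduced with the existing public API; a small sketch:

```python
import tiktoken

enc = tiktoken.get_encoding("gpt2")
# Roughly what the environment shows for one line of input:
# each token id alongside the bytes it decodes back to.
for token in enc.encode("hello world"):
    print(token, enc.decode_single_token_bytes(token))
```
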
Binary file added environment.png
177 changes: 173 additions & 4 deletions src/lib.rs
@@ -4,15 +4,36 @@ use std::collections::HashSet;
use std::num::NonZeroU64;
use std::thread;

use bstr::ByteSlice;
use fancy_regex::Regex;
#[cfg(feature = "python")]
use pyo3::prelude::*;
use pyo3::pyclass;
use pyo3::PyResult;
use pyo3::types::{PyBytes, PyList, PyTuple};

use rustc_hash::FxHashMap as HashMap;

// TUI dependencies: crossterm for terminal control, ratatui for widgets, tui-textarea for the input pane.
use crossterm::terminal::{
disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen,
};
use ratatui::prelude::{Span, Constraint};
use ratatui::Terminal;
use ratatui::backend::CrosstermBackend;
use ratatui::layout::{Layout, Direction, Margin};
use ratatui::text::Line;
use ratatui::widgets::{
block::Title, Block, Padding, Borders, Paragraph, Scrollbar, ScrollbarOrientation,
ScrollbarState, Wrap,
};
use ratatui::style::{Color, Style};
use tui_textarea::TextArea;

#[cfg(feature = "python")]
mod py;

-pub type Rank = u32;
+type Rank = u32;

fn _byte_pair_merge(ranks: &HashMap<Vec<u8>, Rank>, piece: &[u8]) -> Vec<(usize, Rank)> {
// This is a vector of (start, rank).
Expand Down Expand Up @@ -513,12 +534,160 @@ impl CoreBPE {
.collect()
}

pub fn encode_with_special_tokens(&self, text: &str) -> Vec<Rank> {
let allowed_special = self.special_tokens();
self.encode(text, &allowed_special).0
}

fn _environment(&self, name: &str, allowed_special: HashSet<&str>) -> PyResult<()> {
let stdout = std::io::stdout();
let mut stdout = stdout.lock();

enable_raw_mode()?;
crossterm::execute!(stdout, EnterAlternateScreen)?;
let backend = CrosstermBackend::new(stdout);
let mut term = Terminal::new(backend)?;

let mut textarea = TextArea::default();
textarea.set_block(
Block::default()
.borders(Borders::ALL)
.title(format!("{} Encoder", name)).padding(Padding::new(1, 1, 1, 0))
);

let colours = vec![Color::Red, Color::Green, Color::Blue, Color::Yellow, Color::Magenta, Color::Cyan];

let parent_layout = Layout::default()
.constraints([Constraint::Percentage(100), Constraint::Min(1)]);

let layout = Layout::default()
.direction(Direction::Horizontal)
.constraints([Constraint::Percentage(50), Constraint::Percentage(50)].as_ref());


let mut vertical_scroll = 0;
let mut line_count: usize = 0;
loop {
let mut current_color_index = 0;
term.draw(|f| {
let chunks = parent_layout.split(f.size());

let sub_chunk = layout.split(chunks[0]);

let tokens: Vec<Vec<String>> = textarea.lines().iter().map(|line| {

// Encode the line
let encoding = self.encode(line, &allowed_special).0;

// Decode the encoded line
let decoding: Vec<Vec<u8>> = encoding.iter()
.map(|&token| self.decode_bytes(&[token]).unwrap())
.collect();

// Convert decoded tokens to Strings
let tokens: Vec<String> = decoding.iter().map(|bytes| {
bytes.to_str()
.unwrap()
.to_string()

}).collect();

tokens
}).collect();

// Build coloured spans for each line of decoded tokens
let mut lines: Vec<Line> = Vec::new();
let mut token_count = 0;
for line_tokens in &tokens {
let span: Vec<Span> = line_tokens.iter().map(|token| {
let color = colours[current_color_index];
current_color_index = (current_color_index + 1) % colours.len();
token_count += 1; // Increment the token count
Span::styled(token, Style::default().bg(color).fg(Color::White))
}).collect();
lines.push(Line::from(span));

}


let scrollbar = Scrollbar::new(ScrollbarOrientation::VerticalRight)
.begin_symbol(Some("↑"))
.end_symbol(Some("↓"));


let paragraph = Paragraph::new(lines.clone())
.block(Block::default().borders(Borders::ALL)
.title("Decoded Tokens")
.title(Title::from(Line::from(vec![
Span::styled(token_count.to_string(),
Style::new().fg(Color::Green)),
Span::from(" token(s)")]))
.alignment(ratatui::layout::Alignment::Center)
.position(ratatui::widgets::block::Position::Bottom))
.padding(Padding::new(1, 1, 1, 1)))
.scroll((vertical_scroll as u16, 0))
.wrap(Wrap { trim: true });

let menu: Block<'_> = Block::new()
.title(Title::from("[Esc] Exit").alignment(ratatui::layout::Alignment::Left))
.title(Title::from("[Ctrl+S] Scroll Down").alignment(ratatui::layout::Alignment::Center))
.title(Title::from("[Ctrl+A] Scroll Up").alignment(ratatui::layout::Alignment::Center))
.padding(Padding::horizontal(5u16))
.border_style(Style::default().fg(Color::White))
.borders(Borders::TOP);

line_count = lines.len();
let mut scrollbar_state = ScrollbarState::new(line_count)
.position(vertical_scroll);

f.render_widget(menu, chunks[1]);
f.render_widget(textarea.widget(), sub_chunk[0]);
f.render_widget(paragraph, sub_chunk[1]);

f.render_stateful_widget(
scrollbar,
sub_chunk[1].inner(Margin {
// using an inner vertical margin of 1 unit makes the scrollbar inside the block
vertical: 1,
horizontal: 0,
}),
&mut scrollbar_state,
);

})?;

match crossterm::event::read()? {
crossterm::event::Event::Key(key) => {
match key.code {
crossterm::event::KeyCode::Esc => break,
crossterm::event::KeyCode::Char('s') if key.modifiers.contains(crossterm::event::KeyModifiers::CONTROL) => {
if vertical_scroll < line_count - 1 {
vertical_scroll += 1;
}
},
crossterm::event::KeyCode::Char('a') if key.modifiers.contains(crossterm::event::KeyModifiers::CONTROL) => {
if vertical_scroll > 0 {
vertical_scroll -= 1;
}
},
_ => {
textarea.input(key);
}
}
},
_ => {}
}
}

disable_raw_mode()?;
crossterm::execute!(
term.backend_mut(),
LeaveAlternateScreen,
)?;
term.show_cursor()?;

Ok(())

}
}


#[cfg(test)]
mod tests {
use fancy_regex::Regex;
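
One caveat with the draw loop above: `self.decode_bytes(&[token])` returns the raw bytes of a single token, and byte-level BPE does not guarantee those bytes are valid UTF-8 on their own (multi-byte characters are often split across tokens), so the `bytes.to_str().unwrap()` conversion can panic while such text is being typed. A standalone sketch of the failure mode and of a lossy, panic-free alternative using `String::from_utf8_lossy`:

```rust
fn main() {
    // "😀" is the four bytes F0 9F 98 80; under byte-level BPE a single
    // token may cover only a prefix of them, which is not valid UTF-8.
    let piece: &[u8] = b"\xf0\x9f";

    // This is why `bytes.to_str().unwrap()` can panic in the draw loop:
    assert!(std::str::from_utf8(piece).is_err());

    // A display-only fallback that never panics; invalid sequences
    // are rendered as the replacement character U+FFFD.
    println!("{}", String::from_utf8_lossy(piece));
}
```
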
16 changes: 15 additions & 1 deletion src/py.rs
@@ -170,7 +170,21 @@ impl CoreBPE
.map(|x| PyBytes::new_bound(py, x).into())
.collect()
}
}

// ====================
// TUI Environment
// ====================

#[pyo3(name = "_environment")]
fn py_environment(&self, _py: Python, name: &str, allowed_special: HashSet<PyBackedStr>) -> PyResult<()> {
// Convert PyBackedStr to &str
let allowed_special: HashSet<&str> =
allowed_special.iter().map(|s| s.as_ref()).collect();

self._environment(name, allowed_special)
}

}

#[pyclass]
struct TiktokenBuffer {
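
The `#[pyo3(name = "_environment")]` attribute exposes the Rust method to Python under that name on the core object; `Encoding.environment` in `tiktoken/core.py` below is the intended entry point, but the raw binding can be exercised directly. An illustrative sketch (`_core_bpe` is the private attribute tiktoken already uses for the Rust object):

```python
import tiktoken

enc = tiktoken.get_encoding("gpt2")
# Equivalent to enc.environment() with no special tokens allowed:
# the Rust side receives the encoding's name and an empty allowed set.
enc._core_bpe._environment(enc.name, set())
```
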
16 changes: 16 additions & 0 deletions tiktoken/core.py
@@ -119,6 +119,10 @@ def encode(
disallowed_special = frozenset(disallowed_special)
if match := _special_token_regex(disallowed_special).search(text):
raise_disallowed_special_token(match.group())

# https://github.com/PyO3/pyo3/pull/3632
if isinstance(allowed_special, frozenset):
allowed_special = set(allowed_special)

try:
return self._core_bpe.encode(text, allowed_special)
@@ -371,6 +375,18 @@ def n_vocab(self) -> int:
"""For backwards compatibility. Prefer to use `enc.max_token_value + 1`."""
return self.max_token_value + 1

def environment(
self,
*,
allowed_special: Literal["all"] | AbstractSet[str] = set(),  # noqa: B006
disallowed_special: Literal["all"] | Collection[str] = "all",
) -> None:
"""Builds a Text User Interface (TUI) environment to test out encoding."""

if allowed_special == "all":
allowed_special = self.special_tokens_set
if disallowed_special == "all":
disallowed_special = self.special_tokens_set - allowed_special

return self._core_bpe._environment(self.name, allowed_special)
# ====================
# Private
# ====================
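
Since `environment` mirrors `encode`'s `allowed_special` handling, special tokens can be surfaced in the TUI as single tokens instead of being split into ordinary byte-pair pieces. A short usage sketch (assuming an encoding that defines `<|endoftext|>`, such as `cl100k_base`):

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

# Allow every special token the encoding defines; typing "<|endoftext|>"
# in the input pane should then be highlighted as a single token.
enc.environment(allowed_special="all")

# Or allow only a specific subset:
enc.environment(allowed_special={"<|endoftext|>"})
```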