Py.Cafe

solara/

tokenizer

Tokenization Visualizer for GPT-4

DocsPricing
  • app.py
  • requirements.txt
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""# GPT-4 token encoder and decoder

This example shows how to use a language model tokenizer to encode and decode text, and how to search for a token

Inspired by: [Understanding GPT tokenizers by Simon Willison](https://simonwillison.net/2023/Jun/8/gpt-tokenizers/)


## Note

Install pandas and tiktoken with `pip install pandas tiktoken`

"""

import random

import pandas as pd
import tiktoken

import solara
import solara.lab

# Tokenizer matching the gpt-4 model family
tokenizer = tiktoken.encoding_for_model("gpt-4")

# Lookup table mapping every token ID in [0, MAX_TOKENS) to its decoded text
MAX_TOKENS = 50257
df = pd.DataFrame(
    {
        "token ID": range(MAX_TOKENS),
        "token": [tokenizer.decode([i]) for i in range(MAX_TOKENS)],
    }
)

# Reactive state backing the three text inputs on the page
sentence = solara.reactive("Example text is here")
tokens_ids_to_lookup = solara.reactive("")
tokens_filter = solara.reactive("")


@solara.component
def GithubAvatar(name: str, handle: str, img: str):
    """Render an avatar image plus "By <name>" label linking to a GitHub profile."""
    profile_url = f"https://github.com/{handle}/"
    with solara.v.Html(tag="a", attributes={"href": profile_url, "target": "_blank"}):
        with solara.v.ListItem(class_="pa-0"):
            with solara.v.ListItemAvatar(color="grey darken-3"):
                solara.v.Img(class_="elevation-6", src=img)
            with solara.v.ListItemContent():
                solara.v.ListItemTitle(children=[f"By {name}"])


@solara.component
def Token(token: int):
    """Render one token as a colored chip, with the token ID printed below it.

    The background color is derived deterministically from the token ID, so a
    given token always renders with the same color.
    """
    # Use a private RNG seeded by the token ID instead of random.seed(), which
    # would clobber the module-global random state as a side effect of every
    # render. Random(token) yields the same sequence as seed(token) did, so the
    # colors are unchanged.
    rng = random.Random(token)
    random_color = "".join(rng.choice("0123456789ABCDEF") for _ in range(6))
    with solara.Div(style="display: inline;"):
        with solara.Div(
            style={
                "display": "inline",
                "padding": "6px",
                "border-right": "3px solid white",
                "line-height": "3em",
                "font-family": "courier",
                "background-color": f"#{random_color}",
                "color": "white",
                "position": "relative",
            },
        ):
            # Small token-ID label positioned under the chip
            solara.Text(
                " " + str(token),
                style={
                    "display": "inline",
                    "position": "absolute",
                    "top": "5.5ch",
                    "line-height": "1em",
                    "left": "-0.5px",
                    "font-size": "0.45em",
                },
            )
            solara.Text(str(tokenizer.decode([token])))


@solara.component
def Page():
    """Main page: tokenize free text, decode token IDs, and search the token table."""
    with solara.Column(margin=10):
        solara.Markdown("# GPT-4 token encoder and decoder")
        solara.Markdown("This is an educational tool for understanding how tokenization works.")
        GithubAvatar(
            "Alonso Silva Allende",
            "alonsosilvaallende",
            "https://avatars.githubusercontent.com/u/30263736?v=4",
        )

        # Text -> tokens: render one colored chip per encoded token.
        solara.InputText("Enter text to tokenize it:", value=sentence, continuous_update=True)
        tokens = tokenizer.encode(sentence.value)
        with solara.Div(style="display: inline;"):
            for token in tokens:
                Token(token)

        # Tokens -> text. Guard the parse/decode so a typo (non-numeric or
        # out-of-range ID) shows an error message instead of crashing the
        # whole page render on every keystroke.
        solara.InputText("Or convert space separated tokens to text:", value=tokens_ids_to_lookup, continuous_update=True)
        try:
            # split() (vs split(" ")) also tolerates runs of whitespace/tabs
            token_input = [int(span) for span in tokens_ids_to_lookup.value.split()]
            text_output = tokenizer.decode(token_input)
        except (ValueError, KeyError) as err:
            # NOTE(review): assumes tiktoken raises KeyError for unknown IDs —
            # ValueError covers the int() parse either way.
            solara.Error(f"Could not decode token IDs: {err}")
        else:
            solara.Markdown(f"{text_output}")

        # Token search over the precomputed ID/text table.
        solara.Markdown("## Search tokens")
        solara.InputText("Search for a token:", value=tokens_filter, continuous_update=True)
        df_subset = df[df["token"].str.startswith(tokens_filter.value)]
        solara.Markdown(f"{df_subset.shape[0]:,} results")
        solara.DataFrame(df_subset, items_per_page=10)
requirements.txt
1
2
3
solara
pandas
https://py.cafe/files/maartenbreddels/tiktoken-demo/tiktoken-0.7.0-cp312-cp312-pyodide_2024_0_wasm32.whl