Py.Cafe

alonsosilvaallende/solara-transformers_js_py_chat

Sentiment Analysis Solara app using Transformers-JS-Py

  • app.py
  • requirements.txt
  • transformers3.js
app.py
import solara
import shutil
from transformers_js_py import import_transformers_js
import numpy as np
import js
from pathlib import Path

clicks = solara.reactive(0)

input_text = solara.reactive("How are you doing today?")
progress_tokenizer = solara.reactive(0)
progress_model = solara.reactive(0)
messages = solara.reactive([])

js.pyodide.setDebug(True)

model_id = 'Xenova/Phi-3-mini-4k-instruct'
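# Assumption: this is the ONNX export of microsoft/Phi-3-mini-4k-instruct published
# under the Xenova org for use with transformers.js.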



# hack to fetch the file from the webworker, will break in the future
pageId = js._pageId
url = 'https://py.cafe/_app/static/public/transformers3.js?_pageId=' + pageId
public = Path(__file__).parent.parent / 'public'
if not public.exists():
    public.mkdir(parents=True, exist_ok=True)
    name = "transformers3.js"
    jsfile = Path(name)
    shutil.copyfile(jsfile, public / name)
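# The copy above publishes the bundled transformers3.js under py.cafe's /public/
# static path so that import_transformers_js(url) can fetch it from inside the web
# worker; the _pageId query parameter presumably ties the request to this page.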

# bug in solara, after saving, refresh the browser preview
# otherwise the old reference to the function is kept
@solara.lab.task
async def run(message: str):
    transformers = await import_transformers_js(url)
    AutoTokenizer = transformers.AutoTokenizer
    AutoModelForCausalLM = transformers.AutoModelForCausalLM

    def progress_callback(*args):
        info = args[0].to_py()
        if info['status'] == 'progress':
            progress_tokenizer.value = info['progress']

    tokenizer = await AutoTokenizer.from_pretrained(model_id, {
        "legacy": True,
        "progress_callback": progress_callback,
    })
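    # The 'progress' events from from_pretrained stream download progress into the
    # progress_tokenizer reactive (defined above, but not yet shown in the Page UI).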

    def progress_callback(*args):
        info = args[0].to_py()
        if info['status'] == 'progress':
            progress_model.value = info['progress']

    model = await AutoModelForCausalLM.from_pretrained(model_id, {
        "dtype": 'q4',
        "device": 'webgpu',
        "use_external_data_format": True,
        "progress_callback": progress_callback,
    })
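    # Assumed semantics of the load options: dtype='q4' selects 4-bit quantized ONNX
    # weights, device='webgpu' runs inference on the GPU via WebGPU, and
    # use_external_data_format=True is needed because the Phi-3 weights ship as
    # external ONNX data files.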




    messages.value = [*messages.value, { "role": "user", "content": message }]

    print("messages.value", messages.value)
    inputs = tokenizer.apply_chat_template(messages.value, {
        "add_generation_prompt": True,
        "return_dict": True,
    })
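    # With return_dict=True, apply_chat_template returns both 'input_ids' and
    # 'attention_mask' tensors, which are unpacked below.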
    from pyodide.ffi import create_proxy
    # is this needed? Seems to have no effect
    # inputs["input_ids"] = create_proxy(inputs["input_ids"])
    # inputs["attention_mask"] = create_proxy(inputs["attention_mask"])
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    print("attention_mask", attention_mask, attention_mask.dims)
    print("inputs", inputs, input_ids.dims)
    print("inputs", inputs, input_ids.tolist())

    # convert to uint32, otherwise pyodide complains with
    # Unknown typed array type 'BigInt64Array'. This is a problem with Pyodide, please open an issue about it here: https://github.com/pyodide/pyodide/issues/new
    if 0:
        if 1:
            ar = np.array(input_ids.tolist(), dtype=np.uint32)
            inputs["input_ids"] = transformers.Tensor(ar.flatten()).unsqueeze(1)
            ar = np.array(attention_mask.tolist(), dtype=np.uint32)
            inputs["attention_mask"] = transformers.Tensor(ar.flatten()).unsqueeze(1)
            # but we do get after calling model.generate()
            # TypeError: Cannot mix BigInt and other types, use explicit conversions
        else:
            # Not sure why we cannot do this:
            # We get this error:
            # pyodide.ffi.JsException: TypeError: Cannot convert a BigInt value to a number
            inputs["input_ids"] = inputs["input_ids"].to('uint32')
            inputs["attention_mask"] = inputs["attention_mask"].to('uint32')

    TextStreamer = transformers.TextStreamer

    # class MyStreamer(TextStreamer):
    #     def on_finalized_text(self, text):
    #         self.cb(text)

    # streamer = TextStreamer.new(create_proxy(tokenizer), {
    #     "skip_prompt": True,
    #     "skip_special_tokens": True,
    # })
    # stopping_criteria = transformers.StoppingCriteria.new();

    arg = {
        **inputs,
        "max_new_tokens": 512,
        # "streamer": streamer,
        # "stopping_criteria": stopping_criteria,
    }

    print("arg", arg)
    outputs = await model.generate(arg)  # error happens here
    # print(tokenizer, streamer, stopping_criteria, inputs)


    print(tokenizer, model)
    return "dummy"


@solara.lab.task
async def has_shader_f16():
    if not js.navigator.gpu:
        return False
    adapter = await js.navigator.gpu.requestAdapter()
    if not adapter:
        return False
    return adapter.features.has('shader-f16')
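
# Page() below only enables the demo when WebGPU reports the 'shader-f16' feature;
# the q4/WebGPU path is assumed to need fp16 shader support, so browsers without it
# see an error instead.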

@solara.component
def Page():
    solara.use_memo(lambda: has_shader_f16(), [])
    if has_shader_f16.pending:
        solara.ProgressLinear()
    else:
        if has_shader_f16.value:
            with solara.Card("Test LLM"):
                solara.ProgressLinear(run.pending)
                with solara.Div():
                    solara.InputText(label="Input", value=input_text)
                    solara.Button(label=f"Respond", on_click=lambda: run(input_text.value), color="primary", filled=True)
                if run.finished:
                    solara.Text(repr(run.value))
        else:
            solara.Error("no fp16 support")