"""
RoboTF LLM Token Estimator - Streamlit Application
This Python script creates a Streamlit web application that allows users to estimate the number of tokens
for a given text input using a specified Hugging Face model's tokenizer. The tokenizer is loaded using
the `autotiktokenizer` library, which supports a wide range of models available on the Hugging Face Hub.
Features:
- User-friendly interface for entering the model name and text input.
- Displays the estimated token count, original text length, and cleaned text length.
- Supports any model with a `tokenizer.json` in the Hugging Face repository.
- Provides example model names and a link to browse models on the Hugging Face Hub.
- Includes detailed instructions on how the token estimation works.
- Caches the tokenizer for efficiency using Streamlit's `st.cache_resource`.
Dependencies:
- streamlit
- PIL (Python Imaging Library)
- autotiktokenizer
Usage:
1. Run the script using Streamlit: `streamlit run RoboTF_LLM_Token_Estimator.py`
2. Enter the Hugging Face model name and the text you want to analyze.
3. Click the 'Count Tokens' button to see the estimated token count and other details.
Example Model Names:
- mistralai/Mistral-7B-Instruct-v0.3
- deepseek-ai/DeepSeek-Coder-V2-Instruct-0724
- Qwen/Qwen2.5-Coder-32B-Instruct
Note:
- The application removes newline characters from the input text before tokenization.
- The tokenizer is loaded from the specified model repository on Hugging Face.
- For models that are quantized or re-trained, point to the original model repository containing `tokenizer.json`.
Author: Keith Kacsh
GitHub: https://github.com/kkacsh321/robotf-llm-token-estimator
"""
import streamlit as st
from PIL import Image
from autotiktokenizer import AutoTikTokenizer
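
# Stripped of the Streamlit UI, the estimation below boils down to three calls
# (a minimal sketch; the model name is only an example):
#
#     tokenizer = AutoTikTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
#     tokens = tokenizer.encode("Your prompt here")
#     print(len(tokens))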


def main():
    st.set_page_config(
        page_title="LLM Token Estimator",
        layout="wide",
        page_icon="images/favicon.ico",
    )

    try:
        image = Image.open("images/robot_token.jpg")
        st.image(image, width=400)
    except FileNotFoundError as e:
        st.error(f"The image file 'robot_token.jpg' was not found: {e}")
    except Exception as e:
        st.error(f"Failed to load image: {e}")
    # Title and description
    st.title("RoboTF LLM Token Estimator")
    st.write("Enter text below to count the number of tokens for your specific LLM model.")

    # Provide example model names and a link to Hugging Face
    st.write("Example model names: `mistralai/Mistral-7B-Instruct-v0.3`, `deepseek-ai/DeepSeek-Coder-V2-Instruct-0724`, `Qwen/Qwen2.5-Coder-32B-Instruct`")
    st.markdown("[Browse models on Hugging Face Hub](https://huggingface.co/models) 🤗")
    st.markdown("Should support any model with a `tokenizer.json` in its Hugging Face repo.")
    st.markdown("If using a quant or a re-train, point at the original model repo that contains `tokenizer.json`.")
    st.markdown("This is only an estimator, based on the [autotiktokenizer](https://github.com/bhavnicksm/autotiktokenizer) project - go support them!")

    st.markdown("---")
    st.markdown("How this works:")
    st.markdown("You enter the `user/model-repo-name` of a Hugging Face repo that contains a `tokenizer.json`.")
    st.markdown("The app replaces any newline characters in your input with spaces.")
    st.markdown("autotiktokenizer then encodes your prompt and estimates the token count for that model.")
    st.markdown("Hope you find it useful - the GitHub project lives here: [RoboTF LLM Token Estimator](https://github.com/kkacsh321/robotf-llm-token-estimator)")

    # Function to load tokenizer with caching
    @st.cache_resource
    def load_tokenizer(model_name):
        return AutoTikTokenizer.from_pretrained(model_name)
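    # Note: st.cache_resource keys its cache on the function arguments, so each
    # distinct model name is downloaded and parsed only once per server process;
    # repeated 'Count Tokens' clicks with the same model reuse the cached tokenizer.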

    # Create a form for the inputs and button
    with st.form(key='token_count_form'):
        # Single-line input for the Hugging Face model name
        model_name = st.text_input(
            "Enter the Hugging Face model name: (org/model_name)",
            placeholder="mistralai/Mistral-7B-Instruct-v0.3",  # example placeholder, not a default
        )

        # Text area for the user's input text
        user_input = st.text_area("Your text:", height=200)

        # Submit button
        submit_button = st.form_submit_button(label='Count Tokens')
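
    # Widgets inside st.form are batched: changing them does not trigger a rerun
    # of the script until the 'Count Tokens' submit button is pressed.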
    # When the form is submitted
    if submit_button:
        model_name = model_name.strip()  # tolerate stray whitespace around the model name
        if model_name:
            # Initialize the tokenizer (served from cache after the first load)
            try:
                tokenizer = load_tokenizer(model_name)
            except Exception as e:
                st.error(f"Error loading tokenizer for model '{model_name}': {e}")
                st.stop()

            if user_input:
                # Replace newlines with spaces in the user input
                cleaned_input = user_input.replace('\n', ' ')

                # Tokenize the cleaned input text and count the resulting tokens
                tokens = tokenizer.encode(cleaned_input)
                token_count = len(tokens)

                # Display results
                st.subheader("Results")
                st.write(f"**Model:** {model_name}")
                st.write(f"**Estimated Token count:** {token_count}")
                st.write(f"**Original Text Length:** {len(user_input)} characters")
                st.write(f"**Cleaned Text Length:** {len(cleaned_input)} characters")
            else:
                st.warning("Please enter some text to count tokens.")
        else:
            st.warning("Please enter a model name to proceed.")


if __name__ == "__main__":
    main()