update cache GC in demo and add vocab expansion example

This commit is contained in:
yangapku
2023-10-09 14:59:57 +08:00
parent 343017c4ce
commit cbfaada8de
8 changed files with 530 additions and 9 deletions

View File

@@ -9,8 +9,6 @@
"name": "stderr",
"output_type": "stream",
"text": [
"TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
]
}
@@ -414,6 +412,142 @@
"source": [
"tokenizer._convert_token_to_id('<|extra_204|>')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Vocabulary Expansion"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'input_ids': [35946, 99639, 91680, 100472], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer(\"我是一只猫\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[99639, 91680, 100472]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.encode(\"是一只猫\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True, extra_vocab_file=\"qwen_extra.tiktoken\")\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"151857"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'input_ids': [151854], 'token_type_ids': [0], 'attention_mask': [1]}"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer(\"我是一只猫\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'我是一只猫'"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.decode(tokenizer.encode(\"我是一只猫\"))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[151853]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.encode(\"是一只猫\")"
]
}
],
"metadata": {
@@ -432,7 +566,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.10.12"
},
"orig_nbformat": 4
},