mirror of
https://github.com/QwenLM/Qwen.git
synced 2026-05-21 00:45:48 +08:00
Update README.md
This commit is contained in:
11
README.md
11
README.md
@@ -1,4 +1,5 @@
|
|||||||
<br>
|
<br>
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<img src="assets/logo.jpg" width="400"/>
|
<img src="assets/logo.jpg" width="400"/>
|
||||||
<p>
|
<p>
|
||||||
@@ -73,6 +74,7 @@ If your device supports fp16 or bf16, we recommend installing [flash-attention](
|
|||||||
```bash
|
```bash
|
||||||
git clone -b v1.0.8 https://github.com/Dao-AILab/flash-attention
|
git clone -b v1.0.8 https://github.com/Dao-AILab/flash-attention
|
||||||
cd flash-attention && pip install .
|
cd flash-attention && pip install .
|
||||||
|
# Below are optional. Installing them might be slow.
|
||||||
pip install csrc/layer_norm
|
pip install csrc/layer_norm
|
||||||
pip install csrc/rotary
|
pip install csrc/rotary
|
||||||
```
|
```
|
||||||
@@ -87,8 +89,7 @@ To use Qwen-7B-Chat for the inference, all you need to do is to input a few line
|
|||||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
from transformers.generation import GenerationConfig
|
from transformers.generation import GenerationConfig
|
||||||
|
|
||||||
# Note: For tokenizer usage, please refer to examples/tokenizer_showcase.ipynb.
|
# Note: The default behavior now has injection attack prevention off.
|
||||||
# The default behavior now has injection attack prevention off.
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
|
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
|
||||||
|
|
||||||
# use bf16
|
# use bf16
|
||||||
@@ -147,7 +148,7 @@ model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto",
|
|||||||
model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
|
model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
|
||||||
|
|
||||||
inputs = tokenizer('蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是', return_tensors='pt')
|
inputs = tokenizer('蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是', return_tensors='pt')
|
||||||
inputs = inputs.to('cuda:0')
|
inputs = inputs.to(model.device)
|
||||||
pred = model.generate(**inputs)
|
pred = model.generate(**inputs)
|
||||||
print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))
|
print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))
|
||||||
# 蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是亚的斯亚贝巴(Addis Ababa)...
|
# 蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是亚的斯亚贝巴(Addis Ababa)...
|
||||||
@@ -184,6 +185,10 @@ response, history = results['response'], results['history']
|
|||||||
print(f'Response: {response}')
|
print(f'Response: {response}')
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Tokenizer
|
||||||
|
|
||||||
|
Our tokenizer, based on tiktoken, differs from other tokenizers such as the sentencepiece tokenizer. You need to pay attention to special tokens, especially during fine-tuning. For more detailed information on the tokenizer and its use in fine-tuning, please refer to the [documentation](tokenization_note.md).
|
||||||
|
|
||||||
## Quantization
|
## Quantization
|
||||||
|
|
||||||
We provide examples to show how to load models in `NF4` and `Int8`. For starters, make sure you have installed `bitsandbytes`. Note that the requirements for `bitsandbytes` are:
|
We provide examples to show how to load models in `NF4` and `Int8`. For starters, make sure you have installed `bitsandbytes`. Note that the requirements for `bitsandbytes` are:
|
||||||
|
|||||||
Reference in New Issue
Block a user