• HuggingFace的transformers库


    pipeline

    1. from transformers import pipeline
    2. classifier = pipeline("sentiment-analysis")#自动下载模型和tokenizer
    3. classifier("We are very happy to show you the 🤗 Transformers library.")#[{'label': 'POSITIVE', 'score': 0.9998}]
    4. #输入多句
    5. results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])
    6. for result in results:
    7. print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
    8. #可以指定模型
    9. import torch
    10. from transformers import pipeline
    11. from datasets import load_dataset, Audio
    12. speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
    13. dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
    14. dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))
    15. result = speech_recognizer(dataset[:4]["audio"])
    16. print([d["text"] for d in result])
    17. #指定device
    18. transcriber = pipeline(model="openai/whisper-large-v2", device=0)
    19. #自动分配device
    20. #pip install --upgrade accelerate
    21. transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto")
    22. #batch推理
    23. transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2)
    24. audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)]
    25. texts = transcriber(audio_filenames)
    26. #其他参数示例
    27. # pip install accelerate
    28. import torch
    29. from transformers import pipeline
    30. pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
    31. output = pipe("This is a cool example!", do_sample=True, top_p=0.95)

    下面是更多pipeline任务类型,完整列表见官方文档的pipeline任务一览。

    AutoClass

    AutoTokenizer

    1. from transformers import AutoTokenizer
    2. model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    3. tokenizer = AutoTokenizer.from_pretrained(model_name)
    4. encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
    5. print(encoding)
    6. #指定返回pytorch tensor
    7. pt_batch = tokenizer(
    8. ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    9. padding=True,
    10. truncation=True,
    11. max_length=512,
    12. return_tensors="pt",#如果是tf_tensor则写tf
    13. )

    AutoModel

    1. from transformers import AutoModelForSequenceClassification
    2. model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    3. pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    4. pt_outputs = pt_model(**pt_batch)
    5. #输出的是未归一化的logits,softmax等后处理要自己做
    6. from torch import nn
    7. pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
    8. print(pt_predictions)
    9. #保存模型
    10. pt_save_directory = "./pt_save_pretrained"
    11. tokenizer.save_pretrained(pt_save_directory)
    12. pt_model.save_pretrained(pt_save_directory)
    13. pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained")
    14. #torch加载tf保存的模型
    15. from transformers import AutoModel
    16. tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
    17. pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)

    AutoConfig

    1. from transformers import AutoConfig
    2. # Download configuration from huggingface.co and cache.
    3. config = AutoConfig.from_pretrained("bert-base-uncased")
    4. # Download configuration from huggingface.co (user-uploaded) and cache.
    5. config = AutoConfig.from_pretrained("dbmdz/bert-base-german-cased")
    6. # If configuration file is in a directory (e.g., was saved using *save_pretrained('./test/saved_model/')*).
    7. config = AutoConfig.from_pretrained("./test/bert_saved_model/")
    8. # Load a specific configuration file.
    9. config = AutoConfig.from_pretrained("./test/bert_saved_model/my_configuration.json")
    10. # Change some config attributes when loading a pretrained config.
    11. config = AutoConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
    12. config.output_attentions
    13. config, unused_kwargs = AutoConfig.from_pretrained(
    14. "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
    15. )
    16. from transformers import AutoModel
    17. my_model = AutoModel.from_config(config)

     

    Trainer

    1. from transformers import AutoModelForSequenceClassification
    2. from transformers import TrainingArguments
    3. from transformers import AutoTokenizer
    4. from datasets import load_dataset
    5. from transformers import DataCollatorWithPadding
    6. from transformers import Trainer
    7. model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
    8. training_args = TrainingArguments(
    9. output_dir="path/to/save/folder/",
    10. learning_rate=2e-5,
    11. per_device_train_batch_size=8,
    12. per_device_eval_batch_size=8,
    13. num_train_epochs=2,
    14. )
    15. tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    16. dataset = load_dataset("rotten_tomatoes") # doctest: +IGNORE_RESULT
    17. def tokenize_dataset(dataset):
    18. return tokenizer(dataset["text"])
    19. dataset = dataset.map(tokenize_dataset, batched=True)
    20. data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    21. trainer = Trainer(
    22. model=model,
    23. args=training_args,
    24. train_dataset=dataset["train"],
    25. eval_dataset=dataset["test"],
    26. tokenizer=tokenizer,
    27. data_collator=data_collator,
    28. ) # doctest: +SKIP
    29. trainer.train()

    安装

    1. pip install transformers
    2. pip install 'transformers[torch]'#只安装torch后端
    3. #源码安装
    4. pip install git+https://github.com/huggingface/transformers
    5. #开发者模式
    6. git clone https://github.com/huggingface/transformers.git
    7. cd transformers
    8. pip install -e .

    tokenizer

    我获取了opt类型的tokenizer,那么enc是什么类型呢?有哪些方法呢?

    1. from transformers import AutoTokenizer
    2. enc = AutoTokenizer.from_pretrained('facebook/opt-125m')

    可以通过print(enc)看到,enc是GPT2TokenizerFast类型,搜索类型的定义,在python安装包的transformers/models/gpt2/tokenization_gpt2_fast.py

    1. class GPT2TokenizerFast(PreTrainedTokenizerFast):
    2. vocab_files_names = VOCAB_FILES_NAMES
    3. pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    4. max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    5. model_input_names = ["input_ids", "attention_mask"]
    6. slow_tokenizer_class = GPT2Tokenizer
    7. def __init__(
    8. self,
    9. vocab_file=None,
    10. merges_file=None,
    11. tokenizer_file=None,
    12. unk_token="<|endoftext|>",
    13. bos_token="<|endoftext|>",
    14. eos_token="<|endoftext|>",
    15. add_prefix_space=False,
    16. **kwargs,
    17. ):
    18. super().__init__(
    19. vocab_file,
    20. merges_file,
    21. tokenizer_file=tokenizer_file,
    22. unk_token=unk_token,
    23. bos_token=bos_token,
    24. eos_token=eos_token,
    25. add_prefix_space=add_prefix_space,
    26. **kwargs,
    27. )
    28. def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
    29. def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
    30. def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    1. class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    2. vocab_files_names = VOCAB_FILES_NAMES
    3. slow_tokenizer_class: PreTrainedTokenizer = None
    4. def __init__(self, *args, **kwargs):
    5. tokenizer_object = kwargs.pop("tokenizer_object", None)
    6. slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
    7. fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
    8. from_slow = kwargs.pop("from_slow", False)
    9. added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
    10. @property#属性装饰器的作用在于将成员函数变成成员变量,访问的时候不需要/不能加()
    11. def is_fast(self) -> bool:
    12. @property
    13. def can_save_slow_tokenizer(self) -> bool:
    14. @property
    15. def vocab_size(self) -> int:
    16. def get_vocab(self) -> Dict[str, int]:
    17. @property
    18. def vocab(self) -> Dict[str, int]:
    19. @property
    20. def added_tokens_encoder(self) -> Dict[str, int]:
    21. @property
    22. def added_tokens_decoder(self) -> Dict[int, AddedToken]:
    23. def get_added_vocab(self) -> Dict[str, int]:
    24. def __len__(self) -> int:
    25. @property
    26. def backend_tokenizer(self) -> TokenizerFast:
    27. @property
    28. def decoder(self) -> DecoderFast:
    29. def _convert_encoding(
    30. self,
    31. encoding: EncodingFast,
    32. return_token_type_ids: Optional[bool] = None,
    33. return_attention_mask: Optional[bool] = None,
    34. return_overflowing_tokens: bool = False,
    35. return_special_tokens_mask: bool = False,
    36. return_offsets_mapping: bool = False,
    37. return_length: bool = False,
    38. verbose: bool = True,
    39. ) -> Tuple[Dict[str, Any], List[EncodingFast]]:
    40. def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
    41. def _convert_token_to_id_with_added_voc(self, token: str) -> int:
    42. def _convert_id_to_token(self, index: int) -> Optional[str]:
    43. def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
    44. def num_special_tokens_to_add(self, pair: bool = False) -> int:
    45. def convert_ids_to_tokens(
    46. self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    47. ) -> Union[str, List[str]]:
    48. def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
    49. def set_truncation_and_padding(
    50. self,
    51. padding_strategy: PaddingStrategy,
    52. truncation_strategy: TruncationStrategy,
    53. max_length: int,
    54. stride: int,
    55. pad_to_multiple_of: Optional[int],
    56. ):
    57. def _batch_encode_plus(
    58. self,
    59. batch_text_or_text_pairs: Union[
    60. List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
    61. ],
    62. add_special_tokens: bool = True,
    63. padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    64. truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    65. max_length: Optional[int] = None,
    66. stride: int = 0,
    67. is_split_into_words: bool = False,
    68. pad_to_multiple_of: Optional[int] = None,
    69. return_tensors: Optional[str] = None,
    70. return_token_type_ids: Optional[bool] = None,
    71. return_attention_mask: Optional[bool] = None,
    72. return_overflowing_tokens: bool = False,
    73. return_special_tokens_mask: bool = False,
    74. return_offsets_mapping: bool = False,
    75. return_length: bool = False,
    76. verbose: bool = True,
    77. ) -> BatchEncoding:
    78. def _encode_plus(
    79. self,
    80. text: Union[TextInput, PreTokenizedInput],
    81. text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
    82. add_special_tokens: bool = True,
    83. padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    84. truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    85. max_length: Optional[int] = None,
    86. stride: int = 0,
    87. is_split_into_words: bool = False,
    88. pad_to_multiple_of: Optional[int] = None,
    89. return_tensors: Optional[bool] = None,
    90. return_token_type_ids: Optional[bool] = None,
    91. return_attention_mask: Optional[bool] = None,
    92. return_overflowing_tokens: bool = False,
    93. return_special_tokens_mask: bool = False,
    94. return_offsets_mapping: bool = False,
    95. return_length: bool = False,
    96. verbose: bool = True,
    97. **kwargs,
    98. ) -> BatchEncoding:
    99. def convert_tokens_to_string(self, tokens: List[str]) -> str:
    100. def _decode(
    101. self,
    102. token_ids: Union[int, List[int]],
    103. skip_special_tokens: bool = False,
    104. clean_up_tokenization_spaces: bool = None,
    105. **kwargs,
    106. ) -> str:
    107. def _save_pretrained(
    108. self,
    109. save_directory: Union[str, os.PathLike],
    110. file_names: Tuple[str],
    111. legacy_format: Optional[bool] = None,
    112. filename_prefix: Optional[str] = None,
    113. ) -> Tuple[str]:
    114. def train_new_from_iterator(
    115. self,
    116. text_iterator,
    117. vocab_size,
    118. length=None,
    119. new_special_tokens=None,
    120. special_tokens_map=None,
    121. **kwargs,
    122. ):

    流式输出

    官网指导

    1. from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
    2. tok = AutoTokenizer.from_pretrained("gpt2")
    3. model = AutoModelForCausalLM.from_pretrained("gpt2")
    4. inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
    5. streamer = TextStreamer(tok)
    6. # Despite returning the usual output, the streamer will also print the generated text to stdout.
    7. _ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)
    1. from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
    2. from threading import Thread
    3. tok = AutoTokenizer.from_pretrained("gpt2")
    4. model = AutoModelForCausalLM.from_pretrained("gpt2")
    5. inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
    6. streamer = TextIteratorStreamer(tok)
    7. # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
    8. generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
    9. thread = Thread(target=model.generate, kwargs=generation_kwargs)
    10. thread.start()
    11. generated_text = ""
    12. for new_text in streamer:
    13. generated_text += new_text
    14. print(generated_text)

  • 相关阅读:
    HTTPS - 揭秘 TLS 1.2 协议完整握手过程--此文为转发文,一定要结合Wireshark工具看,很清楚
    备战蓝桥杯————k个一组反转单链表
    2022年这一批陕西省工程职称评审难度调整了
    学习 C++ 编程,怎么才能找到合适的练手项目?
    【牛客 - 剑指offer】JZ4 二维数组中的查找 Java实现
    SpringMVC第六阶段:数据在域中的保存(02)
    【UVM 验证平台打印时间单位控制】
    AI时代 编程高手的秘密武器:世界顶级大学推荐的计算机教材
    kafka面试题(基础-进阶-高阶)
    BUUCTF做题Upload-Labs记录pass-11~pass-20
  • 原文地址:https://blog.csdn.net/zhuikefeng/article/details/134282259