1.加载预训练模型的词表和tokenizer
# Load the vocabulary and tokenizer of a pretrained Chinese BERT model
# from a local directory (no download, no custom cache).
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='E:/bert-base-chinese',
    cache_dir=None,          # default cache location
    force_download=False,    # reuse the local copy
)

# Sample sentences reused by the encoding examples below.
sents = [
    '选择珠江花园的原因就是方便。',
    '笔记本的键盘确实爽。',
    '房间太小。其它都一般。',
]
2.使用预训练语言模型来编码句子
# Encode a sentence pair into a single id sequence, then decode it back
# so the inserted special tokens and padding become visible.
out = tokenizer.encode(
    text=sents[0],
    text_pair=sents[1],
    add_special_tokens=True,   # insert [CLS]/[SEP]
    truncation=True,           # cut anything beyond max_length
    padding='max_length',      # pad up to max_length
    max_length=30,
    return_tensors=None,       # plain Python list, not tensors
)

print(tokenizer.decode(out))
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
3.批量编码句子
# Encode several sentences in one call and request every auxiliary output
# (token type ids, attention mask, special-tokens mask, lengths).
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs=[sents[0], sents[1]],
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=15,
    return_tensors=None,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
)

# Dump every field the batch encoding produced.
for key, value in out.items():
    print(key, ':', value)

# Decode both sequences back to text for inspection.
print(tokenizer.decode(out['input_ids'][0]))
print(tokenizer.decode(out['input_ids'][1]))
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
4.预训练语言模型的词表操作:增加新词、增加新符号
# Vocabulary operations: add a new word and a new special symbol.
zidian = tokenizer.get_vocab()
print('月光' in zidian)  # False: the word is not in the pretrained vocab

# Register a new regular token and a new end-of-sequence special token.
tokenizer.add_tokens(new_tokens=['月光'])
# BUG FIX: the special token was written '[EOS' with a missing closing
# bracket; special tokens are conventionally bracketed like [CLS]/[SEP].
tokenizer.add_special_tokens({'eos_token': '[EOS]'})

# Re-fetch the vocabulary; the new word is now present.
zidian = tokenizer.get_vocab()
print('月光' in zidian)  # True
5.dataset的API
# Load the same corpus from two on-disk formats via the datasets library.
from datasets import load_dataset

# CSV source.
csv_dataset = load_dataset(
    path='csv',
    data_files='./data/ChnSentiCorp.csv',
    split='train',
)
print(csv_dataset[2])

# JSON source.
json_dataset = load_dataset(
    path='json',
    data_files='./data/ChnSentiCorp.json',
    split='train',
)
print(json_dataset[2])
# Tour of the Dataset API: load, sort, shuffle, select, filter, split,
# shard, rename/remove columns, set output format, map, and save.
from datasets import load_from_disk

dataset = load_from_disk('./data/ChnSentiCorp/train')
print(dataset)

# Sort by the 'label' column and inspect both ends of the ordering.
sorted_dataset = dataset.sort('label')
print(sorted_dataset['label'][:10])
print(sorted_dataset['label'][-10:])

# Shuffle deterministically with a fixed seed (typo 'shffled' fixed).
shuffled_dataset = sorted_dataset.shuffle(seed=42)
print(shuffled_dataset['label'][:10])

# Pick specific rows by index.
print(dataset.select([0, 10, 20, 30, 40, 50]))


def f(data):
    # Filter predicate: keep rows whose text starts with '选择'.
    return data['text'].startswith('选择')


start_with_ar = dataset.filter(f)
print(start_with_ar['text'])

# NOTE: these methods return NEW datasets; the return values are
# intentionally discarded here — `dataset` itself is unchanged.
dataset.train_test_split(test_size=0.1)
dataset.shard(num_shards=4, index=0)
dataset.rename_column('text', 'textA')
dataset.remove_columns(['text'])

# set_format changes the dataset's output format in place.
dataset.set_format(type='torch', columns=['label'])


def f(data):
    # Map function: prefix every text with a marker string.
    data['text'] = 'My sentence:' + data['text']
    # BUG FIX: a map function must return the modified example; the
    # original returned None, so the edit was never applied.
    return data


dataset_map = dataset.map(f)
dataset_map['text'][:5]

# BUG FIX: Dataset has no `save` method; the documented API is
# `save_to_disk`.
dataset.save_to_disk('路径')

- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73