import sys

# On Python 3.8+ (where only TF 2.x is available) fall back to the v1 compatibility API;
# on older interpreters import TF 1.x directly.
if sys.version_info >= (3, 8):
    import tensorflow.compat.v1 as tf
else:
    import tensorflow as tf
In the Python 2.7 code, the vocabulary was handled with learn.preprocessing.VocabularyProcessor from the tensorflow.contrib.learn module. TF 2.x no longer ships this module, so there are three ways to stay compatible:
1、Use Keras' Tokenizer together with its sequence utilities:
from keras.preprocessing.text import Tokenizer  # token-to-index / one-hot encoding
from keras.preprocessing import sequence        # pad sequences to a uniform length
tokenizer = Tokenizer(num_words=5000, char_level=True, oov_token='UNK')
tokenizer.fit_on_texts(texts)
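As a rough sketch, the fit/transform half of VocabularyProcessor then maps onto texts_to_sequences plus pad_sequences (max_len below is a made-up padding length; texts is the same corpus as above):
max_len = 100  # hypothetical fixed length
seqs = tokenizer.texts_to_sequences(texts)                         # strings -> lists of token ids
x = sequence.pad_sequences(seqs, maxlen=max_len, padding='post')   # pad/truncate every row to max_len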
2、Use gensim's Word2Vec to build the vocabulary and word vectors:
from gensim.models.word2vec import Word2Vec
model = Word2Vec(text, ...)   # train a new model on the corpus
# or load a previously saved model
model = Word2Vec.load(model_path)
model.build_vocab(text, update=True)  # extend the existing vocabulary with new text
model.train(text, total_examples=model.corpus_count, epochs=model.iter)  # gensim 4.x renamed model.iter to model.epochs
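If the downstream TF model still needs an integer vocabulary and an embedding matrix, both can be pulled out of the trained model; a minimal sketch, assuming gensim 4.x (where the index lives in model.wv.index_to_key) and reserving ids 0/1 for PAD/UNK:
import numpy as np

word_to_id = {w: i + 2 for i, w in enumerate(model.wv.index_to_key)}   # 0 = PAD, 1 = UNK
embeddings = np.zeros((len(word_to_id) + 2, model.wv.vector_size), dtype=np.float32)
for w, idx in word_to_id.items():
    embeddings[idx] = model.wv[w]   # rows 0 and 1 stay all-zero for PAD/UNK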
3、Write a small vocabulary class of your own that mimics VocabularyProcessor's fit/transform/save interface:
import json
from collections import Counter
from operator import itemgetter

class VOCAB():
    def __init__(self, PAD_LEN=None):
        if PAD_LEN:
            self.PAD_LEN = PAD_LEN

    def restore(self, vocab_path):
        # reload a vocabulary previously written by save()
        with open(vocab_path, "r") as f:
            all_dict = json.load(f)
        self.vocabulary_ = all_dict['vocabulary']
        self.PAD_LEN = all_dict['pad_length']
        return

    def fit(self, data, min_count=0):
        # build the word-to-id mapping; ids 0 and 1 are reserved for PAD and UNK
        words = sum([i.split(' ') for i in data], [])
        counter = Counter(words)
        sorted_word_to_cnt = sorted(counter.items(), key=itemgetter(1), reverse=True)
        sorted_words = ['PAD', 'UNK']
        for word, count in sorted_word_to_cnt:
            if count > min_count:
                sorted_words.append(word)
        word_to_id = {k: v for k, v in zip(sorted_words, range(len(sorted_words)))}
        self.vocabulary_ = word_to_id
        return word_to_id

    def transform(self, data):
        # map each line to a fixed-length list of ids, truncating or padding to PAD_LEN
        data_to_id = []
        for line in data:
            words = line.split(' ')[:self.PAD_LEN]
            words_id = []
            for word in words:
                words_id.append(self.vocabulary_.get(word, 1))  # 1 = UNK
            if len(words_id) < self.PAD_LEN:
                words_id += [0] * (self.PAD_LEN - len(words_id))  # 0 = PAD
            data_to_id.append(words_id)
        return data_to_id

    def save(self, vocab_path):
        all_dict = {}
        all_dict['vocabulary'] = self.vocabulary_
        all_dict['pad_length'] = self.PAD_LEN
        with open(vocab_path, 'w') as f:
            json.dump(all_dict, f)
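Typical usage mirrors VocabularyProcessor's fit/transform/save workflow (the toy corpus, pad length, and file name below are made-up):
corpus = ["the cat sat", "the dog barked loudly"]   # space-tokenized toy corpus
vocab = VOCAB(PAD_LEN=5)
vocab.fit(corpus)
ids = vocab.transform(corpus)     # e.g. [[2, 3, 4, 0, 0], [2, 5, 6, 7, 0]]
vocab.save("vocab.json")

vocab2 = VOCAB()
vocab2.restore("vocab.json")      # reload the mapping and pad length at inference time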
initializer=tf.contrib.layers.xavier_initializer())
tf.contrib.layers.l2_regularizer(l2_lambda)
In TF 2.x the contrib package was removed. xavier_initializer returns an initializer that performs Xavier initialization of the weights, which is designed to keep the gradient magnitudes roughly the same across layers.
Fix:
1、From TF 2.x onward, tf.contrib.layers.xavier_initializer() is replaced by tf.keras.initializers.glorot_normal() (Xavier and Glorot are two names for the same initialization scheme; note that xavier_initializer() defaulted to the uniform variant, so tf.keras.initializers.glorot_uniform() is the exact drop-in). Substitute the new functions:
initializer=tf.keras.initializers.glorot_normal())
tf.keras.regularizers.l2(l2_lambda)
2、Use the TensorFlow 2.x initializer tf.initializers.GlorotUniform() instead:
initializer = tf.initializers.GlorotUniform(seed=1)
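Put together, a weight variable from the old v1-style graph code might be declared like this (a sketch only; the variable name, shape, and l2_lambda value are placeholders, and eager execution is disabled so the compat-v1 variable machinery behaves as before):
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()                      # keep graph-mode semantics for compat-v1 code

l2_lambda = 1e-4                                  # hypothetical regularization strength
W = tf.get_variable(
    "W",
    shape=[128, 2],                               # placeholder shape
    initializer=tf.keras.initializers.glorot_normal(),   # was tf.contrib.layers.xavier_initializer()
    regularizer=tf.keras.regularizers.l2(l2_lambda))     # was tf.contrib.layers.l2_regularizer(l2_lambda)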
x_input = tf.placeholder(tf.int32, [None, sequence_length], name="x_input")
Fix:
tf.placeholder no longer works while eager execution is on, so call tf.disable_eager_execution() (from the tensorflow.compat.v1 API) before building the graph.
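A minimal sketch of the pattern (sequence_length and the dummy batch are made-up values):
import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()                 # placeholders only work in graph mode

sequence_length = 100                        # hypothetical sequence length
x_input = tf.placeholder(tf.int32, [None, sequence_length], name="x_input")

with tf.Session() as sess:
    batch = np.zeros((2, sequence_length), dtype=np.int32)            # dummy batch
    print(sess.run(tf.shape(x_input), feed_dict={x_input: batch}))    # -> [  2 100]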
Moving from TensorFlow 1.x to 2.x, the contents of the contrib module were folded into the following three packages: