Converting between representations
- from __future__ import absolute_import, division, print_function, unicode_literals
- try:
- # %tensorflow_version only exists in Colab.
- %tensorflow_version 2.x
- except Exception:
- pass
- import tensorflow as tf
TensorFlow的基本tf.string dtype允许您构建字节字符串张量。Unicode字符串默认以UTF-8编码。
tf.constant(u"Thanks 😊")
<tf.Tensor: id=0, shape=(), dtype=string, numpy=b'Thanks \xf0\x9f\x98\x8a'>
tf.constant([u"You're", u"welcome!"]).shape
注意:使用Python构造字符串时,v2和v3对Unicode的处理方式不同。在v2中,Unicode字符串由“u”前缀表示,如上所示。在v3中,字符串默认采用Unicode编码。
- # Unicode string, represented as a UTF-8 encoded string scalar.
- text_utf8 = tf.constant(u"语言处理")
- text_utf8
<tf.Tensor: id=3, shape=(), dtype=string, numpy=b'\xe8\xaf\xad\xe8\xa8\x80\xe5\xa4\x84\xe7\x90\x86'>
- # Unicode string, represented as a UTF-16-BE encoded string scalar.
- text_utf16be = tf.constant(u"语言处理".encode("UTF-16-BE"))
- text_utf16be
<tf.Tensor: id=5, shape=(), dtype=string, numpy=b'\x8b\xed\x8a\x00Y\x04t\x06'>
- # Unicode string, represented as a vector of Unicode code points.
- text_chars = tf.constant([ord(char) for char in u"语言处理"])
- text_chars
<tf.Tensor: id=7, shape=(4,), dtype=int32, numpy=array([35821, 35328, 22788, 29702], dtype=int32)>
- tf.strings.unicode_decode(text_utf8,
- input_encoding='UTF-8')
<tf.Tensor: id=12, shape=(4,), dtype=int32, numpy=array([35821, 35328, 22788, 29702], dtype=int32)>
- tf.strings.unicode_encode(text_chars,
- output_encoding='UTF-8')
<tf.Tensor: id=23, shape=(), dtype=string, numpy=b'\xe8\xaf\xad\xe8\xa8\x80\xe5\xa4\x84\xe7\x90\x86'>
- tf.strings.unicode_transcode(text_utf8,
- input_encoding='UTF8',
- output_encoding='UTF-16-BE')
<tf.Tensor: id=25, shape=(), dtype=string, numpy=b'\x8b\xed\x8a\x00Y\x04t\x06'>
- # A batch of Unicode strings, each represented as a UTF8-encoded string.
- batch_utf8 = [s.encode('UTF-8') for s in
- [u'hÃllo', u'What is the weather tomorrow', u'Göödnight', u'😊']]
- batch_chars_ragged = tf.strings.unicode_decode(batch_utf8,
- input_encoding='UTF-8')
- for sentence_chars in batch_chars_ragged.to_list():
- print(sentence_chars)
- [104, 195, 108, 108, 111]
- [87, 104, 97, 116, 32, 105, 115, 32, 116, 104, 101, 32, 119, 101, 97, 116, 104, 101, 114, 32, 116, 111, 109, 111, 114, 114, 111, 119]
- [71, 246, 246, 100, 110, 105, 103, 104, 116]
- [128522]
- batch_chars_padded = batch_chars_ragged.to_tensor(default_value=-1)
- print(batch_chars_padded.numpy())
- WARNING: Logging before flag parsing goes to stderr.
- W0813 08:53:58.604015 139985772394240 deprecation.py:323] From /tmpfs/src/tf_docs_env/lib/python3.5/site-packages/tensorflow/python/ops/ragged/ragged_tensor.py:1553: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
- Instructions for updating:
- Use tf.where in 2.0, which has the same broadcast rule as np.where
- [[ 104 195 108 108 111 -1 -1 -1 -1 -1
- -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
- -1 -1 -1 -1 -1 -1 -1 -1]
- [ 87 104 97 116 32 105 115 32 116 104
- 101 32 119 101 97 116 104 101 114 32
- 116 111 109 111 114 114 111 119]
- [ 71 246 246 100 110 105 103 104 116 -1
- -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
- -1 -1 -1 -1 -1 -1 -1 -1]
- [128522 -1 -1 -1 -1 -1 -1 -1 -1 -1
- -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
- -1 -1 -1 -1 -1 -1 -1 -1]]
batch_chars_sparse = batch_chars_ragged.to_sparse()
- tf.strings.unicode_encode([[99, 97, 116], [100, 111, 103], [ 99, 111, 119]],
- output_encoding='UTF-8')
<tf.Tensor: id=129, shape=(3,), dtype=string, numpy=array([b'cat', b'dog', b'cow'], dtype=object)>
tf.strings.unicode_encode(batch_chars_ragged, output_encoding='UTF-8')
- <tf.Tensor: id=131, shape=(4,), dtype=string, numpy=
- array([b'h\xc3\x83llo', b'What is the weather tomorrow',
- b'G\xc3\xb6\xc3\xb6dnight', b'\xf0\x9f\x98\x8a'], dtype=object)>
- tf.strings.unicode_encode(
- tf.RaggedTensor.from_sparse(batch_chars_sparse),
- output_encoding='UTF-8')
- <tf.Tensor: id=214, shape=(4,), dtype=string, numpy=
- array([b'h\xc3\x83llo', b'What is the weather tomorrow',
- b'G\xc3\xb6\xc3\xb6dnight', b'\xf0\x9f\x98\x8a'], dtype=object)>
- tf.strings.unicode_encode(
- tf.RaggedTensor.from_tensor(batch_chars_padded, padding=-1),
- output_encoding='UTF-8')
- <tf.Tensor: id=289, shape=(4,), dtype=string, numpy=
- array([b'h\xc3\x83llo', b'What is the weather tomorrow',
- b'G\xc3\xb6\xc3\xb6dnight', b'\xf0\x9f\x98\x8a'], dtype=object)>
- # Note that the final character takes up 4 bytes in UTF8.
- thanks = u'Thanks 😊'.encode('UTF-8')
- num_bytes = tf.strings.length(thanks).numpy()
- num_chars = tf.strings.length(thanks, unit='UTF8_CHAR').numpy()
- print('{} bytes; {} UTF-8 characters'.format(num_bytes, num_chars))
11 bytes; 8 UTF-8 characters
- # default: unit='BYTE'. With len=1, we return a single byte.
- tf.strings.substr(thanks, pos=7, len=1).numpy()
- # Specifying unit='UTF8_CHAR', we return a single character, which in this case
- # is 4 bytes.
- print(tf.strings.substr(thanks, pos=7, len=1, unit='UTF8_CHAR').numpy())
tf.strings.unicode_split(thanks, 'UTF-8').numpy()
- array([b'T', b'h', b'a', b'n', b'k', b's', b' ', b'\xf0\x9f\x98\x8a'],
- dtype=object)
- codepoints, offsets = tf.strings.unicode_decode_with_offsets(u"🎈🎉🎊", 'UTF-8')
- for (codepoint, offset) in zip(codepoints.numpy(), offsets.numpy()):
- print("At byte offset {}: codepoint {}".format(offset, codepoint))
- At byte offset 0: codepoint 127880
- At byte offset 4: codepoint 127881
- At byte offset 8: codepoint 127882
每个Unicode代码点都属于一个称为文字系统(script)的代码点集合。字符所属的文字系统有助于判断该字符可能来自哪种语言。例如,知道“Б”属于西里尔字母,就表明包含该字符的现代文本很可能来自俄语或乌克兰语等斯拉夫语言。TensorFlow提供了tf.strings.unicode_script操作,用于确定给定代码点使用的是哪种文字系统。文字系统代码是int32值,对应于International Components for Unicode (ICU)的UScriptCode值。
- uscript = tf.strings.unicode_script([33464, 1041]) # ['芸', 'Б']
- print(uscript.numpy()) # [17, 8] == [USCRIPT_HAN, USCRIPT_CYRILLIC]
[17 8]
tf.strings.unicode_script操作也可以应用于由代码点组成的多维tf.Tensor或tf.RaggedTensor:
<tf.RaggedTensor [[25, 25, 25, 25, 25], [25, 25, 25, 25, 0, 25, 25, 0, 25, 25, 25, 0, 25, 25, 25, 25, 25, 25, 25, 0, 25, 25, 25, 25, 25, 25, 25, 25], [25, 25, 25, 25, 25, 25, 25, 25, 25], [0]]>
- # dtype: string; shape: [num_sentences]
- #
- # The sentences to process. Edit this line to try out different inputs!
- sentence_texts = [u'Hello, world.', u'世界こんにちは']
- # dtype: int32; shape: [num_sentences, (num_chars_per_sentence)]
- #
- # sentence_char_codepoint[i, j] is the codepoint for the j'th character in
- # the i'th sentence.
- sentence_char_codepoint = tf.strings.unicode_decode(sentence_texts, 'UTF-8')
- print(sentence_char_codepoint)
- # dtype: int32; shape: [num_sentences, (num_chars_per_sentence)]
- #
- # sentence_char_script[i, j] is the unicode script of the j'th character in
- # the i'th sentence.
- sentence_char_script = tf.strings.unicode_script(sentence_char_codepoint)
- print(sentence_char_script)
- <tf.RaggedTensor [[72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 46], [19990, 30028, 12371, 12435, 12395, 12385, 12399]]>
- <tf.RaggedTensor [[25, 25, 25, 25, 25, 0, 0, 25, 25, 25, 25, 25, 0], [17, 17, 20, 20, 20, 20, 20]]>
- # dtype: bool; shape: [num_sentences, (num_chars_per_sentence)]
- #
- # sentence_char_starts_word[i, j] is True if the j'th character in the i'th
- # sentence is the start of a word.
- sentence_char_starts_word = tf.concat(
- [tf.fill([sentence_char_script.nrows(), 1], True),
- tf.not_equal(sentence_char_script[:, 1:], sentence_char_script[:, :-1])],
- axis=1)
- # dtype: int64; shape: [num_words]
- #
- # word_starts[i] is the index of the character that starts the i'th word (in
- # the flattened list of characters from all sentences).
- word_starts = tf.squeeze(tf.where(sentence_char_starts_word.values), axis=1)
- print(word_starts)
tf.Tensor([ 0 5 7 12 13 15], shape=(6,), dtype=int64)
- # dtype: int32; shape: [num_words, (num_chars_per_word)]
- #
- # word_char_codepoint[i, j] is the codepoint for the j'th character in the
- # i'th word.
- word_char_codepoint = tf.RaggedTensor.from_row_starts(
- values=sentence_char_codepoint.values,
- row_starts=word_starts)
- print(word_char_codepoint)
<tf.RaggedTensor [[72, 101, 108, 108, 111], [44, 32], [119, 111, 114, 108, 100], [46], [19990, 30028], [12371, 12435, 12395, 12385, 12399]]>
最后,我们可以把单词代码点的RaggedTensor重新按句子分组:
- # dtype: int64; shape: [num_sentences]
- #
- # sentence_num_words[i] is the number of words in the i'th sentence.
- sentence_num_words = tf.reduce_sum(
- tf.cast(sentence_char_starts_word, tf.int64),
- axis=1)
- # dtype: int32; shape: [num_sentences, (num_words_per_sentence), (num_chars_per_word)]
- #
- # sentence_word_char_codepoint[i, j, k] is the codepoint for the k'th character
- # in the j'th word in the i'th sentence.
- sentence_word_char_codepoint = tf.RaggedTensor.from_row_lengths(
- values=word_char_codepoint,
- row_lengths=sentence_num_words)
- print(sentence_word_char_codepoint)
<tf.RaggedTensor [[[72, 101, 108, 108, 111], [44, 32], [119, 111, 114, 108, 100], [46]], [[19990, 30028], [12371, 12435, 12395, 12385, 12399]]]>
tf.strings.unicode_encode(sentence_word_char_codepoint, 'UTF-8').to_list()
- [[b'Hello', b', ', b'world', b'.'],
- [b'\xe4\xb8\x96\xe7\x95\x8c',
- b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf']]