# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for text input preprocessing."""
# pylint: disable=invalid-name

from keras_preprocessing import text

from tensorflow.python.keras.preprocessing.text_dataset import text_dataset_from_directory  # pylint: disable=unused-import
from tensorflow.python.util.tf_export import keras_export

# Thin re-exports of the keras_preprocessing implementations. They are
# registered with `keras_export` at the bottom of this file so they appear
# under `tf.keras.preprocessing.text.*`.
hashing_trick = text.hashing_trick
Tokenizer = text.Tokenizer


@keras_export('keras.preprocessing.text.text_to_word_sequence')
def text_to_word_sequence(input_text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=' '):
  """Converts a text to a sequence of words (or tokens).

  This function transforms a string of text into a list of words
  while ignoring `filters` which include punctuations by default.

  >>> sample_text = 'This is a sample sentence.'
  >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
  ['this', 'is', 'a', 'sample', 'sentence']

  Args:
    input_text: Input text (string).
    filters: list (or concatenation) of characters to filter out, such as
      punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
      includes basic punctuation, tabs, and newlines.
    lower: boolean. Whether to convert the input to lowercase.
    split: str. Separator for word splitting.

  Returns:
    A list of words (or tokens).
  """
  # Delegates to the keras_preprocessing implementation.
  return text.text_to_word_sequence(
      input_text, filters=filters, lower=lower, split=split)


@keras_export('keras.preprocessing.text.one_hot')
def one_hot(input_text,
            n,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=' '):
  r"""One-hot encodes a text into a list of word indexes of size `n`.

  This function receives as input a string of text and returns a
  list of encoded integers each corresponding to a word (or token)
  in the given input string.

  Args:
    input_text: Input text (string).
    n: int. Size of vocabulary.
    filters: list (or concatenation) of characters to filter out, such as
      punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n'``,
      includes basic punctuation, tabs, and newlines.
    lower: boolean. Whether to set the text to lowercase.
    split: str. Separator for word splitting.

  Returns:
    List of integers in `[1, n]`. Each integer encodes a word
    (unicity non-guaranteed).
  """
  # Delegates to the keras_preprocessing implementation.
  return text.one_hot(input_text, n, filters=filters, lower=lower, split=split)


# `text.tokenizer_from_json` is only available if keras_preprocessing >= 1.1.0,
# so guard the re-export: on older versions the attribute lookup raises
# AttributeError and the symbol is simply not exported.
try:
  tokenizer_from_json = text.tokenizer_from_json
  keras_export('keras.preprocessing.text.tokenizer_from_json')(
      tokenizer_from_json)
except AttributeError:
  pass

keras_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
keras_export('keras.preprocessing.text.Tokenizer')(Tokenizer)