# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for text input preprocessing."""
# pylint: disable=invalid-name

from keras_preprocessing import text

from tensorflow.python.keras.preprocessing.text_dataset import text_dataset_from_directory  # pylint: disable=unused-import
from tensorflow.python.util.tf_export import keras_export

hashing_trick = text.hashing_trick
Tokenizer = text.Tokenizer


@keras_export('keras.preprocessing.text.text_to_word_sequence')
def text_to_word_sequence(input_text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=' '):
32  """Converts a text to a sequence of words (or tokens).
33
34  This function transforms a string of text into a list of words
35  while ignoring `filters` which include punctuations by default.
36
37  >>> sample_text = 'This is a sample sentence.'
38  >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
39  ['this', 'is', 'a', 'sample', 'sentence']
40
41  Args:
42      input_text: Input text (string).
43      filters: list (or concatenation) of characters to filter out, such as
44          punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
45            includes basic punctuation, tabs, and newlines.
46      lower: boolean. Whether to convert the input to lowercase.
47      split: str. Separator for word splitting.
48
49  Returns:
50      A list of words (or tokens).
51  """
52  return text.text_to_word_sequence(
53      input_text, filters=filters, lower=lower, split=split)
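
# A minimal usage sketch (illustrative only): `filters` and `split` can be
# overridden; the expected outputs below follow from the documented defaults,
# where filter characters are replaced by the split character before splitting.
#
#   text_to_word_sequence('Hello, world!', lower=False)
#   # -> ['Hello', 'world']
#
#   text_to_word_sequence('red,Green,BLUE', split=',')
#   # -> ['red', 'green', 'blue']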


@keras_export('keras.preprocessing.text.one_hot')
def one_hot(input_text,
            n,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=' '):
  r"""One-hot encodes a text into a list of word indexes in a vocabulary of size `n`.

  This function receives as input a string of text and returns a
  list of encoded integers, each corresponding to a word (or token)
  in the given input string.

  Args:
      input_text: Input text (string).
      n: int. Size of vocabulary.
      filters: list (or concatenation) of characters to filter out, such as
        punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n'``,
        includes basic punctuation, tabs, and newlines.
      lower: boolean. Whether to set the text to lowercase.
      split: str. Separator for word splitting.

  Returns:
      List of integers in `[1, n]`. Each integer encodes a word
      (the mapping is not guaranteed to be unique).
  """
  return text.one_hot(input_text, n, filters=filters, lower=lower, split=split)
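
# A minimal usage sketch (illustrative only): `one_hot` hashes each word into
# the range [1, n], so different words may collide and the exact indexes
# depend on the underlying hash function.
#
#   one_hot('The cat sat on the mat', n=100)
#   # -> a list of 6 integers, each in [1, 100]; both occurrences of 'the'
#   #    map to the same index.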


# text.tokenizer_from_json is only available if keras_preprocessing >= 1.1.0
try:
  tokenizer_from_json = text.tokenizer_from_json
  keras_export('keras.preprocessing.text.tokenizer_from_json')(
      tokenizer_from_json)
except AttributeError:
  pass
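
# A minimal round-trip sketch (assumes keras_preprocessing >= 1.1.0, so that
# both `Tokenizer.to_json` and `tokenizer_from_json` are available): a fitted
# Tokenizer can be serialized to JSON and restored later.
#
#   tokenizer = Tokenizer(num_words=100)
#   tokenizer.fit_on_texts(['the cat sat on the mat'])
#   json_config = tokenizer.to_json()
#   restored = tokenizer_from_json(json_config)
#   # `restored` reproduces the original word index and can be used with
#   # `texts_to_sequences` as usual.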

keras_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
keras_export('keras.preprocessing.text.Tokenizer')(Tokenizer)
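
# A minimal usage sketch (illustrative only) of the re-exported
# keras_preprocessing helpers `Tokenizer` and `hashing_trick`:
#
#   tokenizer = Tokenizer(num_words=1000)
#   tokenizer.fit_on_texts(['the cat sat on the mat',
#                           'the dog ate my homework'])
#   sequences = tokenizer.texts_to_sequences(['the cat ate the homework'])
#   # `sequences` is a list containing one list of integer word indexes.
#
#   hashing_trick('the cat sat on the mat', n=1000, hash_function='md5')
#   # -> a list of 6 integers in [1, 1000]; 'md5' is consistent across runs,
#   #    unlike the default `hash`.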