/src/sentencepiece/src/normalizer.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2016 Google Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | #ifndef NORMALIZER_NORMALIZER_H_ |
16 | | #define NORMALIZER_NORMALIZER_H_ |
17 | | |
18 | | #include <memory> |
19 | | #include <set> |
20 | | #include <string> |
21 | | #include <utility> |
22 | | #include <vector> |
23 | | |
24 | | #include "common.h" |
25 | | #include "sentencepiece_model.pb.h" |
26 | | #include "sentencepiece_processor.h" |
27 | | #include "third_party/absl/strings/string_view.h" |
28 | | #include "third_party/darts_clone/darts.h" |
29 | | |
30 | | namespace sentencepiece { |
31 | | namespace normalizer { |
32 | | |
33 | | // Given a list of strings, finds the longest string which is a |
34 | | // prefix of a query. |
35 | | class PrefixMatcher { |
36 | | public: |
37 | | // Initializes the PrefixMatcher with the set of strings `dic`. |
38 | | explicit PrefixMatcher(const std::set<absl::string_view> &dic); |
39 | | |
40 | | // Finds the longest string in `dic` which is a prefix of `w`. |
41 | | // Returns the UTF8 byte length of the matched string. |
42 | | // `found` is optional (defaults to nullptr); it is set if a prefix match exists. |
43 | | // If no entry is found, consumes one Unicode character. |
44 | | int PrefixMatch(absl::string_view w, bool *found = nullptr) const; |
45 | | |
46 | | // Replaces entries in `w` with `out` and returns the resulting string. |
47 | | std::string GlobalReplace(absl::string_view w, absl::string_view out) const; |
48 | | |
49 | | private: |
50 | | std::unique_ptr<Darts::DoubleArray> trie_; |
51 | | }; |
52 | | |
53 | | // Normalizer implements a simple text normalizer with |
54 | | // user-defined string-to-string rules and leftmost longest |
55 | | // matching. The rules of Normalizer are built with |
56 | | // Builder::CompileCharsMap() method. Pre-compiled rules are |
57 | | // also available via Builder::GetPrecompiledCharsMap(<name>) method. |
58 | | // |
59 | | // The motivation of Normalizer is to make flexible, user-customizable |
60 | | // and self-contained normalizer. All the logic of normalization is |
61 | | // encoded in the model proto which allows us to define language/task |
62 | | // dependent normalization rules without breaking the default rule. |
63 | | class Normalizer { |
64 | | public: |
65 | | // Instantiates Normalizer with |spec|. |
66 | | // |spec| should not be deleted until Normalizer is destroyed. |
67 | | explicit Normalizer(const NormalizerSpec &spec); |
68 | | Normalizer(const NormalizerSpec &spec, const TrainerSpec &trainer_Spec); |
69 | | virtual ~Normalizer(); |
70 | | |
71 | 0 | virtual void SetPrefixMatcher(const PrefixMatcher *matcher) { |
72 | 0 | matcher_ = matcher; |
73 | 0 | } |
74 | | |
75 | | // Returns the status of this Normalizer. |
76 | | // The Normalize functions are valid only when status is OK. |
77 | 0 | virtual util::Status status() const { return status_; } |
78 | | |
79 | | // Normalizes a plain utf8 string into an internal representation for |
80 | | // Sentencepiece model. |norm_to_orig| stores the byte-alignment from |
81 | | // normalized string to the original input. |
82 | | // This function can do the following normalizations: |
83 | | // - Character normalization. |
84 | | // (NFKC / full-width to half-width conversion etc). |
85 | | // - Adds a prefix space. |
86 | | // - Replaces a space with a meta symbol. |
87 | | // - Removes leading, trailing and other redundant spaces. |
88 | | virtual util::Status Normalize(absl::string_view input, |
89 | | std::string *normalized, |
90 | | std::vector<size_t> *norm_to_orig) const; |
91 | | |
92 | | // Returns a normalized string without alignments. |
93 | | // This function is used in sentencepiece training. |
94 | | virtual std::string Normalize(absl::string_view input) const; |
95 | | |
96 | | friend class Builder; |
97 | | |
98 | | private: |
99 | | FRIEND_TEST(NormalizerTest, EncodeDecodePrecompiledCharsMapTest); |
100 | | |
101 | | void Init(); |
102 | | |
103 | | // Normalizes the prefix of |input| and returns the pair of |
104 | | // normalized prefix and length we must consume after |
105 | | // normalization. |
106 | | // Here's the sample code for the full text normalization. |
107 | | // |
108 | | // string output; |
109 | | // absl::string_view input = "..."; |
110 | | // while (!input.empty()) { |
111 | | // const auto p = normalizer.NormalizePrefix(input); |
112 | | // output.append(p.first.data(), p.first.size()); |
113 | | // input.remove_prefix(p.second); |
114 | | // } |
115 | | std::pair<absl::string_view, int> NormalizePrefix( |
116 | | absl::string_view input) const; |
117 | | |
118 | | // Encodes trie_blob and normalized string and returns the compiled blob. |
119 | | static std::string EncodePrecompiledCharsMap(absl::string_view trie_blob, |
120 | | absl::string_view normalized); |
121 | | |
122 | | // Decodes blob into trie_blob and normalized string. |
123 | | static util::Status DecodePrecompiledCharsMap(absl::string_view blob, |
124 | | absl::string_view *trie_blob, |
125 | | absl::string_view *normalized, |
126 | | std::string *buffer = nullptr); |
127 | | |
128 | | // Maximum size of the return value of Trie, which corresponds |
129 | | // to the maximum size of shared common prefix in the chars map. |
130 | | static constexpr int kMaxTrieResultsSize = 32; |
131 | | |
132 | | // Internal trie for efficient longest matching. |
133 | | std::unique_ptr<Darts::DoubleArray> trie_; |
134 | | |
135 | | // "\0"-delimited output string. |
136 | | // The value of |trie_| stores pointers to this string. |
137 | | absl::string_view normalized_; |
138 | | |
139 | | // Spec for normalization. |
140 | | const NormalizerSpec *spec_; |
141 | | |
142 | | // Prefix matcher. Assigned by SetPrefixMatcher(); nullptr by default. |
143 | | const PrefixMatcher *matcher_ = nullptr; |
144 | | |
145 | | // Splits "hello world" into "hello_" and "world_" instead of |
146 | | // "_hello" and "_world". |
147 | | const bool treat_whitespace_as_suffix_ = false; |
148 | | |
149 | | #ifdef IS_BIG_ENDIAN |
150 | | // Stores the blob for TRIE encoded in big-endian. |
151 | | std::string precompiled_charsmap_buffer_; |
152 | | #endif |
153 | | |
154 | | // Normalizer's status. |
155 | | util::Status status_; |
156 | | }; |
157 | | } // namespace normalizer |
158 | | } // namespace sentencepiece |
159 | | #endif // NORMALIZER_NORMALIZER_H_ |