/src/sentencepiece/src/pretokenizer_for_training.cc
Line | Count | Source |
1 | | // Copyright 2016 Google Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | #include "pretokenizer_for_training.h" |
15 | | |
16 | | #include <string> |
17 | | |
18 | | #include "third_party/absl/strings/str_replace.h" |
19 | | |
20 | | namespace sentencepiece { |
21 | | namespace pretokenizer { |
22 | | |
namespace {
// UTF-8 encoding of U+2581 (LOWER ONE EIGHTH BLOCK), the whitespace marker.
//
// TODO(taku): The same marker is defined in trainer_interface.h. It is
// spelled out again here to avoid depending on trainer_interface, because
// there are currently no separate build rules.
constexpr char kWSStr[] = "\xe2\x96\x81";
}  // namespace
29 | | |
30 | | std::vector<std::string> PretokenizerForTrainingInterface::PreTokenize( |
31 | 0 | absl::string_view text) const { |
32 | 0 | return Postprocess(Tokenize(Preprocess(text))); |
33 | 0 | } |
34 | | |
35 | | // static |
36 | | std::string PretokenizerForTrainingInterface::Preprocess( |
37 | 0 | absl::string_view text) { |
38 | | // Escapes kWSStr (_) as this character may not be processed by pre-tokenizer. |
39 | 0 | return absl::StrReplaceAll(text, {{kWSStr, " "}}); |
40 | 0 | } |
41 | | |
42 | | // static |
43 | | std::vector<std::string> PretokenizerForTrainingInterface::Postprocess( |
44 | 0 | const SentencePieceText &spt) { |
45 | | // Inserts kUPPBoundaryStr before/after of token boundaries. |
46 | 0 | std::vector<std::string> result; |
47 | 0 | std::string output; |
48 | |
|
49 | 0 | uint32_t prev = 0; |
50 | 0 | for (const auto &piece : spt.pieces()) { |
51 | 0 | if (prev == piece.begin() && piece.begin() != 0) { |
52 | 0 | result.push_back(output); |
53 | 0 | output.clear(); |
54 | 0 | } else { |
55 | 0 | output.append(piece.begin() - prev, ' '); |
56 | 0 | } |
57 | 0 | output += piece.surface(); |
58 | 0 | prev = piece.end(); |
59 | 0 | } |
60 | |
|
61 | 0 | if (!output.empty()) result.push_back(output); |
62 | |
|
63 | 0 | for (auto &w : result) w = absl::StrReplaceAll(w, {{" ", kWSStr}}); |
64 | |
|
65 | 0 | return result; |
66 | 0 | } |
67 | | |
68 | | } // namespace pretokenizer |
69 | | } // namespace sentencepiece |