Coverage Report

Created: 2026-06-09 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/sentencepiece/src/pretokenizer_for_training.cc
Line
Count
Source
1
// Copyright 2016 Google Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
#include "pretokenizer_for_training.h"
15
16
#include <string>
17
18
#include "third_party/absl/strings/str_replace.h"
19
20
namespace sentencepiece {
21
namespace pretokenizer {
22
23
namespace {
// TODO(taku): This marker is also defined in trainer_interface.h. It is
// duplicated here on purpose to avoid a dependency on trainer_interface,
// because there are currently no separate build rules.
//
// UTF-8 byte sequence of U+2581, used as the whitespace marker.
constexpr char kWSStr[] = "\xe2\x96\x81";
}  // namespace
29
30
std::vector<std::string> PretokenizerForTrainingInterface::PreTokenize(
31
0
    absl::string_view text) const {
32
0
  return Postprocess(Tokenize(Preprocess(text)));
33
0
}
34
35
// static
36
std::string PretokenizerForTrainingInterface::Preprocess(
37
0
    absl::string_view text) {
38
  // Escapes kWSStr (_) as this character may not be processed by pre-tokenizer.
39
0
  return absl::StrReplaceAll(text, {{kWSStr, " "}});
40
0
}
41
42
// static
43
std::vector<std::string> PretokenizerForTrainingInterface::Postprocess(
44
0
    const SentencePieceText &spt) {
45
  // Inserts kUPPBoundaryStr before/after of token boundaries.
46
0
  std::vector<std::string> result;
47
0
  std::string output;
48
49
0
  uint32_t prev = 0;
50
0
  for (const auto &piece : spt.pieces()) {
51
0
    if (prev == piece.begin() && piece.begin() != 0) {
52
0
      result.push_back(output);
53
0
      output.clear();
54
0
    } else {
55
0
      output.append(piece.begin() - prev, ' ');
56
0
    }
57
0
    output += piece.surface();
58
0
    prev = piece.end();
59
0
  }
60
61
0
  if (!output.empty()) result.push_back(output);
62
63
0
  for (auto &w : result) w = absl::StrReplaceAll(w, {{" ", kWSStr}});
64
65
0
  return result;
66
0
}
67
68
}  // namespace pretokenizer
69
}  // namespace sentencepiece