/src/sentencepiece/third_party/esaxx/esa.hxx
Line | Count | Source |
1 | | /* |
2 | | * esa.hxx |
3 | | * Copyright (c) 2010 Daisuke Okanohara All Rights Reserved. |
4 | | * |
5 | | * Permission is hereby granted, free of charge, to any person |
6 | | * obtaining a copy of this software and associated documentation |
7 | | * files (the "Software"), to deal in the Software without |
8 | | * restriction, including without limitation the rights to use, |
9 | | * copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the |
11 | | * Software is furnished to do so, subject to the following |
12 | | * conditions: |
13 | | * |
14 | | * The above copyright notice and this permission notice shall be |
15 | | * included in all copies or substantial portions of the Software. |
16 | | * |
17 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
18 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
19 | | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
20 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
21 | | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
22 | | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
23 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
24 | | * OTHER DEALINGS IN THE SOFTWARE. |
25 | | */ |
26 | | |
27 | | #ifndef _ESA_HXX |
28 | | #define _ESA_HXX |
29 | | |
30 | | #include <vector> |
31 | | #include <utility> |
32 | | #include <cassert> |
33 | | #include "sais.hxx" |
34 | | |
35 | | namespace esaxx_private { |
36 | | template<typename string_type, typename sarray_type, typename index_type> |
37 | 6.80k | index_type suffixtree(string_type T, sarray_type SA, sarray_type L, sarray_type R, sarray_type D, index_type n){ |
38 | 6.80k | if (n == 0){ |
39 | 0 | return 0; |
40 | 0 | } |
41 | 6.80k | sarray_type Psi = L; |
42 | 6.80k | Psi[SA[0]] = SA[n-1]; |
43 | 6.34M | for (index_type i = 1; i < n; ++i){ |
44 | 6.34M | Psi[SA[i]] = SA[i-1]; |
45 | 6.34M | } |
46 | | |
47 | | // Compare at most 2n log n charcters. Practically fastest |
48 | | // "Permuted Longest-Common-Prefix Array", Juha Karkkainen, CPM 09 |
49 | 6.80k | sarray_type PLCP = R; |
50 | 6.80k | index_type h = 0; |
51 | 6.35M | for (index_type i = 0; i < n; ++i){ |
52 | 6.34M | index_type j = Psi[i]; |
53 | 12.5M | while (i+h < n && j+h < n && |
54 | 12.4M | T[i+h] == T[j+h]){ |
55 | 6.15M | ++h; |
56 | 6.15M | } |
57 | 6.34M | PLCP[i] = h; |
58 | 6.34M | if (h > 0) --h; |
59 | 6.34M | } |
60 | | |
61 | 6.80k | sarray_type H = L; |
62 | 6.35M | for (index_type i = 0; i < n; ++i){ |
63 | 6.34M | H[i] = PLCP[SA[i]]; |
64 | 6.34M | } |
65 | 6.80k | H[0] = -1; |
66 | | |
67 | 6.80k | std::vector<std::pair<index_type, index_type> > S; |
68 | 6.80k | S.push_back(std::make_pair((index_type)-1, (index_type)-1)); |
69 | 6.80k | size_t nodeNum = 0; |
70 | 6.35M | for (index_type i = 0; ; ++i){ |
71 | 6.35M | std::pair<index_type, index_type> cur (i, (i == n) ? -1 : H[i]); |
72 | 6.35M | std::pair<index_type, index_type> cand(S.back()); |
73 | 16.4M | while (cand.second > cur.second){ |
74 | 10.1M | if (i - cand.first > 1){ |
75 | 3.79M | L[nodeNum] = cand.first; |
76 | 3.79M | R[nodeNum] = i; |
77 | 3.79M | D[nodeNum] = cand.second; |
78 | 3.79M | ++nodeNum; |
79 | 3.79M | } |
80 | 10.1M | cur.first = cand.first; |
81 | 10.1M | S.pop_back(); |
82 | 10.1M | cand = S.back(); |
83 | 10.1M | } |
84 | 6.35M | if (cand.second < cur.second){ |
85 | 3.79M | S.push_back(cur); |
86 | 3.79M | } |
87 | 6.35M | if (i == n) break; |
88 | 6.34M | S.push_back(std::make_pair(i, n - SA[i] + 1)); |
89 | 6.34M | } |
90 | 6.80k | return nodeNum; |
91 | 6.80k | } Unexecuted instantiation: long esaxx_private::suffixtree<std::__1::__wrap_iter<unsigned int*>, std::__1::__wrap_iter<long*>, long>(std::__1::__wrap_iter<unsigned int*>, std::__1::__wrap_iter<long*>, std::__1::__wrap_iter<long*>, std::__1::__wrap_iter<long*>, std::__1::__wrap_iter<long*>, long) int esaxx_private::suffixtree<std::__1::__wrap_iter<unsigned int*>, std::__1::__wrap_iter<int*>, int>(std::__1::__wrap_iter<unsigned int*>, std::__1::__wrap_iter<int*>, std::__1::__wrap_iter<int*>, std::__1::__wrap_iter<int*>, std::__1::__wrap_iter<int*>, int) Line | Count | Source | 37 | 6.80k | index_type suffixtree(string_type T, sarray_type SA, sarray_type L, sarray_type R, sarray_type D, index_type n){ | 38 | 6.80k | if (n == 0){ | 39 | 0 | return 0; | 40 | 0 | } | 41 | 6.80k | sarray_type Psi = L; | 42 | 6.80k | Psi[SA[0]] = SA[n-1]; | 43 | 6.34M | for (index_type i = 1; i < n; ++i){ | 44 | 6.34M | Psi[SA[i]] = SA[i-1]; | 45 | 6.34M | } | 46 | | | 47 | | // Compare at most 2n log n charcters. Practically fastest | 48 | | // "Permuted Longest-Common-Prefix Array", Juha Karkkainen, CPM 09 | 49 | 6.80k | sarray_type PLCP = R; | 50 | 6.80k | index_type h = 0; | 51 | 6.35M | for (index_type i = 0; i < n; ++i){ | 52 | 6.34M | index_type j = Psi[i]; | 53 | 12.5M | while (i+h < n && j+h < n && | 54 | 12.4M | T[i+h] == T[j+h]){ | 55 | 6.15M | ++h; | 56 | 6.15M | } | 57 | 6.34M | PLCP[i] = h; | 58 | 6.34M | if (h > 0) --h; | 59 | 6.34M | } | 60 | | | 61 | 6.80k | sarray_type H = L; | 62 | 6.35M | for (index_type i = 0; i < n; ++i){ | 63 | 6.34M | H[i] = PLCP[SA[i]]; | 64 | 6.34M | } | 65 | 6.80k | H[0] = -1; | 66 | | | 67 | 6.80k | std::vector<std::pair<index_type, index_type> > S; | 68 | 6.80k | S.push_back(std::make_pair((index_type)-1, (index_type)-1)); | 69 | 6.80k | size_t nodeNum = 0; | 70 | 6.35M | for (index_type i = 0; ; ++i){ | 71 | 6.35M | std::pair<index_type, index_type> cur (i, (i == n) ? -1 : H[i]); | 72 | 6.35M | std::pair<index_type, index_type> cand(S.back()); | 73 | 16.4M | while (cand.second > cur.second){ | 74 | 10.1M | if (i - cand.first > 1){ | 75 | 3.79M | L[nodeNum] = cand.first; | 76 | 3.79M | R[nodeNum] = i; | 77 | 3.79M | D[nodeNum] = cand.second; | 78 | 3.79M | ++nodeNum; | 79 | 3.79M | } | 80 | 10.1M | cur.first = cand.first; | 81 | 10.1M | S.pop_back(); | 82 | 10.1M | cand = S.back(); | 83 | 10.1M | } | 84 | 6.35M | if (cand.second < cur.second){ | 85 | 3.79M | S.push_back(cur); | 86 | 3.79M | } | 87 | 6.35M | if (i == n) break; | 88 | 6.34M | S.push_back(std::make_pair(i, n - SA[i] + 1)); | 89 | 6.34M | } | 90 | 6.80k | return nodeNum; | 91 | 6.80k | } |
|
92 | | } |
93 | | |
94 | | /** |
95 | | * @brief Build an enhanced suffix array of a given string in linear time |
96 | | * For an input text T, esaxx() builds an enhancd suffix array in linear time. |
97 | | * i-th internal node is represented as a triple (L[i], R[i], D[i]); |
98 | | * L[i] and R[i] is the left/right boundary of the suffix array as SA[L[i]....R[i]-1] |
99 | | * D[i] is the depth of the internal node |
100 | | * The number of internal node is at most N-1 and return the actual number by |
101 | | * @param T[0...n-1] The input string. (random access iterator) |
102 | | * @param SA[0...n-1] The output suffix array (random access iterator) |
103 | | * @param L[0...n-1] The output left boundary of internal node (random access iterator) |
104 | | * @param R[0...n-1] The output right boundary of internal node (random access iterator) |
105 | | * @param D[0...n-1] The output depth of internal node (random access iterator) |
106 | | * @param n The length of the input string |
107 | | * @param k The alphabet size |
108 | | * @pram nodeNum The output the number of internal node |
109 | | * @return 0 if succeded, -1 or -2 otherwise |
110 | | */ |
111 | | |
112 | | template<typename string_type, typename sarray_type, typename index_type> |
113 | | int esaxx(string_type T, sarray_type SA, sarray_type L, sarray_type R, sarray_type D, |
114 | 6.80k | index_type n, index_type k, index_type& nodeNum) { |
115 | 6.80k | if ((n < 0) || (k <= 0)) return -1; |
116 | 6.80k | int err = saisxx(T, SA, n, k); |
117 | 6.80k | if (err != 0){ |
118 | 0 | return err; |
119 | 0 | } |
120 | 6.80k | nodeNum = esaxx_private::suffixtree(T, SA, L, R, D, n); |
121 | 6.80k | return 0; |
122 | 6.80k | } Unexecuted instantiation: int esaxx<std::__1::__wrap_iter<unsigned int*>, std::__1::__wrap_iter<long*>, long>(std::__1::__wrap_iter<unsigned int*>, std::__1::__wrap_iter<long*>, std::__1::__wrap_iter<long*>, std::__1::__wrap_iter<long*>, std::__1::__wrap_iter<long*>, long, long, long&) int esaxx<std::__1::__wrap_iter<unsigned int*>, std::__1::__wrap_iter<int*>, int>(std::__1::__wrap_iter<unsigned int*>, std::__1::__wrap_iter<int*>, std::__1::__wrap_iter<int*>, std::__1::__wrap_iter<int*>, std::__1::__wrap_iter<int*>, int, int, int&) Line | Count | Source | 114 | 6.80k | index_type n, index_type k, index_type& nodeNum) { | 115 | 6.80k | if ((n < 0) || (k <= 0)) return -1; | 116 | 6.80k | int err = saisxx(T, SA, n, k); | 117 | 6.80k | if (err != 0){ | 118 | 0 | return err; | 119 | 0 | } | 120 | 6.80k | nodeNum = esaxx_private::suffixtree(T, SA, L, R, D, n); | 121 | 6.80k | return 0; | 122 | 6.80k | } |
|
123 | | |
124 | | |
125 | | #endif // _ESA_HXX |