/rust/registry/src/index.crates.io-1949cf8c6b5b557f/rust-stemmers-1.2.0/src/lib.rs
Line | Count | Source |
1 | | //! This library provides Rust implementations of several stemming algorithms |
2 | | //! written in the [Snowball language](https://snowballstem.org/). |
3 | | //! |
4 | | //! |
5 | | //! All algorithms expect the input to already be lowercased. |
6 | | //! |
7 | | //! # Usage |
8 | | //! ```toml |
9 | | //! [dependencies] |
10 | | //! rust-stemmers = "^1.0" |
11 | | //! ``` |
12 | | //! |
13 | | //! ```rust |
14 | | //! extern crate rust_stemmers; |
15 | | //! |
16 | | //! use rust_stemmers::{Algorithm, Stemmer}; |
17 | | //! |
18 | | //! fn main() { |
19 | | //! let en_stemmer = Stemmer::create(Algorithm::English); |
20 | | //! assert_eq!(en_stemmer.stem("fruitlessly"), "fruitless"); |
21 | | //! } |
22 | | //! ``` |
23 | | extern crate serde; |
24 | | #[macro_use] |
25 | | extern crate serde_derive; |
26 | | |
27 | | use std::borrow::Cow; |
28 | | |
29 | | mod snowball; |
30 | | |
31 | | use snowball::SnowballEnv; |
32 | | use snowball::algorithms; |
33 | | |
34 | | /// Enum of all supported algorithms. |
35 | | /// Check the [Snowball-Website](https://snowballstem.org/) for details. |
36 | | #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)] |
37 | | pub enum Algorithm { |
38 | | Arabic, |
39 | | Danish, |
40 | | Dutch, |
41 | | English, |
42 | | Finnish, |
43 | | French, |
44 | | German, |
45 | | Greek, |
46 | | Hungarian, |
47 | | Italian, |
48 | | Norwegian, |
49 | | Portuguese, |
50 | | Romanian, |
51 | | Russian, |
52 | | Spanish, |
53 | | Swedish, |
54 | | Tamil, |
55 | | Turkish |
56 | | } |
57 | | |
58 | | /// Wrapps a usable interface around the actual stemmer implementation |
59 | | pub struct Stemmer { |
60 | | stemmer: fn(&mut SnowballEnv) -> bool, |
61 | | } |
62 | | |
63 | | impl Stemmer { |
64 | | /// Create a new stemmer from an algorithm |
65 | 0 | pub fn create(lang: Algorithm) -> Self { |
66 | 0 | match lang { |
67 | 0 | Algorithm::Arabic => Stemmer { stemmer: algorithms::arabic::stem }, |
68 | 0 | Algorithm::Danish => Stemmer { stemmer: algorithms::danish::stem }, |
69 | 0 | Algorithm::Dutch => Stemmer { stemmer: algorithms::dutch::stem }, |
70 | 0 | Algorithm::English => Stemmer { stemmer: algorithms::english::stem }, |
71 | 0 | Algorithm::Finnish => Stemmer { stemmer: algorithms::finnish::stem }, |
72 | 0 | Algorithm::French => Stemmer { stemmer: algorithms::french::stem }, |
73 | 0 | Algorithm::German => Stemmer { stemmer: algorithms::german::stem }, |
74 | 0 | Algorithm::Greek => Stemmer { stemmer: algorithms::greek::stem }, |
75 | 0 | Algorithm::Hungarian => Stemmer { stemmer: algorithms::hungarian::stem }, |
76 | 0 | Algorithm::Italian => Stemmer { stemmer: algorithms::italian::stem }, |
77 | 0 | Algorithm::Norwegian => Stemmer { stemmer: algorithms::norwegian::stem }, |
78 | 0 | Algorithm::Portuguese => Stemmer { stemmer: algorithms::portuguese::stem }, |
79 | 0 | Algorithm::Romanian => Stemmer { stemmer: algorithms::romanian::stem }, |
80 | 0 | Algorithm::Russian => Stemmer { stemmer: algorithms::russian::stem }, |
81 | 0 | Algorithm::Spanish => Stemmer { stemmer: algorithms::spanish::stem }, |
82 | 0 | Algorithm::Swedish => Stemmer { stemmer: algorithms::swedish::stem }, |
83 | 0 | Algorithm::Tamil => Stemmer { stemmer: algorithms::tamil::stem }, |
84 | 0 | Algorithm::Turkish => Stemmer { stemmer: algorithms::turkish::stem }, |
85 | | } |
86 | 0 | } |
87 | | |
88 | | /// Stem a single word |
89 | | /// Please note, that the input is expected to be all lowercase (if that is applicable). |
90 | 0 | pub fn stem<'a>(&self, input: &'a str) -> Cow<'a, str> { |
91 | 0 | let mut env = SnowballEnv::create(input); |
92 | 0 | (self.stemmer)(&mut env); |
93 | 0 | env.get_current() |
94 | 0 | } |
95 | | } |
96 | | |
97 | | |
98 | | |
#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::{BufRead, BufReader, Lines};

    use super::{Algorithm, Stemmer};

    /// Opens the fixture at `path` and returns an iterator over its lines.
    ///
    /// Panics with the offending path when the file cannot be opened, so a
    /// missing fixture is easy to tell apart from a stemming failure.
    fn fixture_lines(path: &str) -> Lines<BufReader<File>> {
        let file = File::open(path)
            .unwrap_or_else(|err| panic!("cannot open fixture {}: {}", path, err));
        BufReader::new(file).lines()
    }

    /// Checks `algorithm` against the fixture pair for `lang_code`.
    ///
    /// Line `n` of `test_data/voc_<lang_code>.txt` is the input word and line
    /// `n` of `test_data/res_<lang_code>.txt` is the stem it must produce.
    /// Both files are walked in lockstep; a length mismatch is reported as a
    /// failure instead of silently ignoring the surplus lines.
    fn assert_fixture_stems(lang_code: &str, algorithm: Algorithm) {
        let vocab_path = format!("test_data/voc_{}.txt", lang_code);
        let result_path = format!("test_data/res_{}.txt", lang_code);

        let mut vocab = fixture_lines(&vocab_path);
        let mut expected = fixture_lines(&result_path);
        let stemmer = Stemmer::create(algorithm);

        let mut line_no = 0usize;
        loop {
            line_no += 1;
            let (word, stem) = match (vocab.next(), expected.next()) {
                (None, None) => break,
                (Some(word), Some(stem)) => (word, stem),
                (Some(_), None) => panic!(
                    "{} ends before {} (at line {})",
                    result_path, vocab_path, line_no
                ),
                (None, Some(_)) => panic!(
                    "{} ends before {} (at line {})",
                    vocab_path, result_path, line_no
                ),
            };

            let word = word
                .unwrap_or_else(|err| panic!("{}:{}: {}", vocab_path, line_no, err));
            let stem = stem
                .unwrap_or_else(|err| panic!("{}:{}: {}", result_path, line_no, err));

            assert_eq!(
                stemmer.stem(&word),
                stem.as_str(),
                "{:?} stemmer produced the wrong stem for {:?} ({}:{})",
                algorithm,
                word,
                vocab_path,
                line_no
            );
        }
    }

    #[test]
    fn german_test() {
        assert_fixture_stems("ger", Algorithm::German);
    }

    #[test]
    fn english_test() {
        assert_fixture_stems("en", Algorithm::English);
    }

    #[test]
    fn french_test() {
        assert_fixture_stems("fr", Algorithm::French);
    }

    #[test]
    fn spanish_test() {
        assert_fixture_stems("es", Algorithm::Spanish);
    }

    #[test]
    fn portuguese_test() {
        assert_fixture_stems("pt", Algorithm::Portuguese);
    }

    #[test]
    fn italian_test() {
        assert_fixture_stems("it", Algorithm::Italian);
    }

    #[test]
    fn romanian_test() {
        assert_fixture_stems("ro", Algorithm::Romanian);
    }

    #[test]
    fn russian_test() {
        assert_fixture_stems("ru", Algorithm::Russian);
    }

    #[test]
    fn arabic_test() {
        assert_fixture_stems("ar", Algorithm::Arabic);
    }

    #[test]
    fn finnish_test() {
        assert_fixture_stems("fi", Algorithm::Finnish);
    }

    #[test]
    fn greek_test() {
        assert_fixture_stems("el", Algorithm::Greek);
    }

    #[test]
    fn norwegian_test() {
        assert_fixture_stems("no", Algorithm::Norwegian);
    }
}