#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import pkgutil


class Perluniprops:
    """
    This class is used to read lists of characters from the Perl Unicode
    Properties (see http://perldoc.perl.org/perluniprops.html).
    The files in perluniprops.zip were extracted using the Unicode::Tussle
    module from http://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    """

    def __init__(self):
        self.datadir = (
            os.path.dirname(os.path.abspath(__file__)) + "/data/perluniprops/"
        )
        # These are categories similar to the Perl Unicode Properties
        self.available_categories = [
            "Close_Punctuation",
            "Currency_Symbol",
            "IsAlnum",
            "IsAlpha",
            "IsLower",
            "IsN",
            "IsSc",
            "IsSo",
            "IsUpper",
            "Line_Separator",
            "Number",
            "Open_Punctuation",
            "Punctuation",
            "Separator",
            "Symbol",
            "Lowercase_Letter",
            "Titlecase_Letter",
            "Uppercase_Letter",
            "IsPf",
            "IsPi",
            "CJKSymbols",
            "CJK",
        ]

    def chars(self, category=None):
        """
        This method yields the characters that belong to the given Perl
        Unicode property. These lists are very useful when porting Perl
        tokenizers to Python.

        >>> from sacremoses.corpus import Perluniprops
        >>> pup = Perluniprops()
        >>> list(pup.chars('Open_Punctuation'))[:5] == ['(', '[', '{', '\u0f3a', '\u0f3c']
        True
        >>> list(pup.chars('Currency_Symbol'))[:5] == ['$', '\xa2', '\xa3', '\xa4', '\xa5']
        True
        >>> pup.available_categories[:5]
        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower']

        :return: a generator of characters for the given Unicode character category.
        """
        relative_path = os.path.join("data", "perluniprops", category + ".txt")
        binary_data = pkgutil.get_data("sacremoses", relative_path)
        for ch in binary_data.decode("utf-8"):
            yield ch
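
# A minimal usage sketch (illustration only, not part of this module's API):
# when porting a Perl tokenizer, the character lists are typically joined into
# a regex character class, roughly as follows:
#
#     import re
#     pup = Perluniprops()
#     open_punct = "".join(pup.chars("Open_Punctuation"))
#     OPEN_PUNCT_RE = re.compile("([{}])".format(re.escape(open_punct)))
#     padded = OPEN_PUNCT_RE.sub(r" \1 ", "(hello)")  # pads opening punctuation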


class NonbreakingPrefixes:
    """
    This class reads the nonbreaking prefix text files from the Moses
    Machine Translation toolkit. These lists are used in the Python port
    of the Moses word tokenizer.
    """

    def __init__(self):
        self.datadir = (
            os.path.dirname(os.path.abspath(__file__)) + "/data/nonbreaking_prefixes/"
        )
        self.available_langs = {
            "assamese": "as",
            "bengali": "bn",
            "catalan": "ca",
            "czech": "cs",
            "german": "de",
            "greek": "el",
            "english": "en",
            "spanish": "es",
            "estonian": "et",
            "finnish": "fi",
            "french": "fr",
            "irish": "ga",
            "gujarati": "gu",
            "hindi": "hi",
            "hungarian": "hu",
            "icelandic": "is",
            "italian": "it",
            "kannada": "kn",
            "lithuanian": "lt",
            "latvian": "lv",
            "malayalam": "ml",
            "manipuri": "mni",
            "marathi": "mr",
            "dutch": "nl",
            "oriya": "or",
            "punjabi": "pa",
            "polish": "pl",
            "portuguese": "pt",
            "romanian": "ro",
            "russian": "ru",
            "slovak": "sk",
            "slovenian": "sl",
            "swedish": "sv",
            "tamil": "ta",
            "telugu": "te",
            "tetum": "tdt",
            "cantonese": "yue",
            "chinese": "zh",
        }
        # Also, add the lang IDs as the keys.
        self.available_langs.update({v: v for v in self.available_langs.values()})
    def words(self, lang=None, ignore_lines_startswith="#"):
        """
        This method yields the nonbreaking prefixes for the specified
        language(s).

        >>> from sacremoses.corpus import NonbreakingPrefixes
        >>> nbp = NonbreakingPrefixes()
        >>> list(nbp.words('en'))[:10] == ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
        True
        >>> list(nbp.words('ta'))[:5] == ['\u0bb0', '\u0bc2', '\u0ba4\u0bbf\u0bb0\u0bc1', '\u0b8f', '\u0baa\u0bc0']
        True

        :return: a generator of words for the specified language(s).
        """
        # If *lang* is a known language name or ID, use its prefix file.
        if lang in self.available_langs:
            filenames = ["nonbreaking_prefix." + self.available_langs[lang]]
        # Use the nonbreaking prefixes of all languages when lang is None.
        elif lang is None:
            filenames = [
                "nonbreaking_prefix." + v for v in set(self.available_langs.values())
            ]
        # Fall back to English for unknown language codes.
        else:
            filenames = ["nonbreaking_prefix.en"]

        for filename in filenames:
            relative_path = os.path.join("data", "nonbreaking_prefixes", filename)
            binary_data = pkgutil.get_data("sacremoses", relative_path)
            for line in binary_data.decode("utf-8").splitlines():
                line = line.strip()
                if line and not line.startswith(ignore_lines_startswith):
                    yield line
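
# A minimal usage sketch (illustration only, not part of this module's API):
# a tokenizer typically loads the prefixes into a set so it can decide whether
# a token that ends with a period is an abbreviation whose period should stay
# attached, e.g.:
#
#     nbp = NonbreakingPrefixes()
#     english_prefixes = set(nbp.words("en"))
#     token = "Mr"
#     if token in english_prefixes:
#         pass  # do not split a following period off this token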


__all__ = ["Perluniprops", "NonbreakingPrefixes"]
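

# Optional self-check: running this module directly executes the doctest
# examples embedded in the docstrings above. This assumes the sacremoses
# package is importable, since pkgutil.get_data() resolves the data files
# through the installed package.
if __name__ == "__main__":
    import doctest

    doctest.testmod()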