Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/sacremoses/corpus.py: 94%

31 statements  

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import pkgutil


class Perluniprops:
    """
    This class is used to read lists of characters from the Perl Unicode
    Properties (see http://perldoc.perl.org/perluniprops.html).
    The files in the perluniprop.zip are extracted using the Unicode::Tussle
    module from http://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    """

    def __init__(self):
        self.datadir = (
            os.path.dirname(os.path.abspath(__file__)) + "/data/perluniprops/"
        )
        # These are categories similar to the Perl Unicode Properties
        self.available_categories = [
            "Close_Punctuation",
            "Currency_Symbol",
            "IsAlnum",
            "IsAlpha",
            "IsLower",
            "IsN",
            "IsSc",
            "IsSo",
            "IsUpper",
            "Line_Separator",
            "Number",
            "Open_Punctuation",
            "Punctuation",
            "Separator",
            "Symbol",
            "Lowercase_Letter",
            "Titlecase_Letter",
            "Uppercase_Letter",
            "IsPf",
            "IsPi",
            "CJKSymbols",
            "CJK",
        ]

    def chars(self, category=None):
        """
        This method returns a generator of characters from the given Perl
        Unicode Properties category. These lists are very useful when
        porting Perl tokenizers to Python.

        >>> from sacremoses.corpus import Perluniprops
        >>> pup = Perluniprops()
        >>> list(pup.chars('Open_Punctuation'))[:5] == ['(', '[', '{', '\u0f3a', '\u0f3c']
        True
        >>> list(pup.chars('Currency_Symbol'))[:5] == ['$', '\xa2', '\xa3', '\xa4', '\xa5']
        True
        >>> pup.available_categories[:5]
        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower']

        :return: a generator of characters for the given unicode character category
        """
        relative_path = os.path.join("data", "perluniprops", category + ".txt")
        binary_data = pkgutil.get_data("sacremoses", relative_path)
        for ch in binary_data.decode("utf-8"):
            yield ch
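The short sketch below is not part of corpus.py; it shows one common way the characters yielded by Perluniprops.chars() can be consumed when porting Perl tokenizer patterns: joined into a string and dropped into a regex character class. The variable names are illustrative, and the '(' check relies only on the doctest above.

    import re

    from sacremoses.corpus import Perluniprops

    # Illustrative sketch only; not part of sacremoses/corpus.py.
    pup = Perluniprops()
    # Join the category's characters into one string and escape it so it
    # can sit safely inside a regex character class.
    open_punct = "".join(pup.chars("Open_Punctuation"))
    open_punct_re = re.compile("[" + re.escape(open_punct) + "]")
    # Per the doctest above, '(' belongs to Open_Punctuation.
    assert open_punct_re.match("(") is not None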

class NonbreakingPrefixes:
    """
    This is a class to read the nonbreaking prefixes text files from the
    Moses Machine Translation toolkit. These lists are used in the Python
    port of the Moses word tokenizer.
    """

    def __init__(self):
        self.datadir = (
            os.path.dirname(os.path.abspath(__file__)) + "/data/nonbreaking_prefixes/"
        )
        self.available_langs = {
            "assamese": "as",
            "bengali": "bn",
            "catalan": "ca",
            "czech": "cs",
            "german": "de",
            "greek": "el",
            "english": "en",
            "spanish": "es",
            "estonian": "et",
            "finnish": "fi",
            "french": "fr",
            "irish": "ga",
            "gujarati": "gu",
            "hindi": "hi",
            "hungarian": "hu",
            "icelandic": "is",
            "italian": "it",
            "kannada": "kn",
            "lithuanian": "lt",
            "latvian": "lv",
            "malayalam": "ml",
            "manipuri": "mni",
            "marathi": "mr",
            "dutch": "nl",
            "oriya": "or",
            "punjabi": "pa",
            "polish": "pl",
            "portuguese": "pt",
            "romanian": "ro",
            "russian": "ru",
            "slovak": "sk",
            "slovenian": "sl",
            "swedish": "sv",
            "tamil": "ta",
            "telugu": "te",
            "tetum": "tdt",
            "cantonese": "yue",
            "chinese": "zh",
        }
        # Also, add the lang IDs as the keys.
        self.available_langs.update({v: v for v in self.available_langs.values()})

    def words(self, lang=None, ignore_lines_startswith="#"):
        """
        This method returns a generator of nonbreaking prefixes for the
        specified language(s).

        >>> from sacremoses.corpus import NonbreakingPrefixes
        >>> nbp = NonbreakingPrefixes()
        >>> list(nbp.words('en'))[:10] == ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
        True
        >>> list(nbp.words('ta'))[:5] == ['\u0bb0', '\u0bc2', '\u0ba4\u0bbf\u0bb0\u0bc1', '\u0b8f', '\u0baa\u0bc0']
        True

        :return: a generator of words for the specified language(s).
        """
        # If *lang* is among the available languages, use its prefix file.
        if lang in self.available_langs:
            filenames = ["nonbreaking_prefix." + self.available_langs[lang]]
        # Use the non-breaking prefixes for all languages when lang is None.
        elif lang is None:
            filenames = [
                "nonbreaking_prefix." + v for v in set(self.available_langs.values())
            ]
        else:
            filenames = ["nonbreaking_prefix.en"]

        for filename in filenames:
            relative_path = os.path.join("data", "nonbreaking_prefixes", filename)
            binary_data = pkgutil.get_data("sacremoses", relative_path)
            for line in binary_data.decode("utf-8").splitlines():
                line = line.strip()
                if line and not line.startswith(ignore_lines_startswith):
                    yield line

__all__ = ["Perluniprops", "NonbreakingPrefixes"]
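Similarly, here is a minimal sketch (again not part of corpus.py) of how NonbreakingPrefixes.words() might be consumed, for example by a sentence splitter deciding whether a period after a token marks an abbreviation rather than a sentence boundary. The helper name is hypothetical; the 'A' check is grounded in the doctest above.

    from sacremoses.corpus import NonbreakingPrefixes

    # Illustrative sketch only; not part of sacremoses/corpus.py.
    nbp = NonbreakingPrefixes()
    # Materialise the generator into a set for cheap membership tests.
    en_prefixes = set(nbp.words("en"))

    def period_may_be_abbreviation(token):
        # Hypothetical helper: a trailing period after a known nonbreaking
        # prefix (e.g. a title or an initial) usually does not end a sentence.
        return token in en_prefixes

    # Per the doctest above, single capital letters such as 'A' appear in
    # the English list, so this prints True.
    print(period_may_be_abbreviation("A"))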