#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import pkgutil


class Perluniprops:
    """
    This class is used to read lists of characters from the Perl Unicode
    Properties (see http://perldoc.perl.org/perluniprops.html).
    The files in perluniprops.zip were extracted using the Unicode::Tussle
    module from http://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    """

    def __init__(self):
        self.datadir = (
            os.path.dirname(os.path.abspath(__file__)) + "/data/perluniprops/"
        )
        # These are categories similar to the Perl Unicode Properties
        self.available_categories = [
            "Close_Punctuation",
            "Currency_Symbol",
            "IsAlnum",
            "IsAlpha",
            "IsLower",
            "IsN",
            "IsSc",
            "IsSo",
            "IsUpper",
            "Line_Separator",
            "Number",
            "Open_Punctuation",
            "Punctuation",
            "Separator",
            "Symbol",
            "Lowercase_Letter",
            "Titlecase_Letter",
            "Uppercase_Letter",
            "IsPf",
            "IsPi",
            "CJKSymbols",
            "CJK",
        ]

    def chars(self, category=None):
        """
        This method yields the characters that belong to the given Perl
        Unicode property. These lists are very useful when porting Perl
        tokenizers to Python.

        >>> from sacremoses.corpus import Perluniprops
        >>> pup = Perluniprops()
        >>> list(pup.chars('Open_Punctuation'))[:5] == ['(', '[', '{', '\u0f3a', '\u0f3c']
        True
        >>> list(pup.chars('Currency_Symbol'))[:5] == ['$', '\xa2', '\xa3', '\xa4', '\xa5']
        True
        >>> pup.available_categories[:5]
        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower']

        :return: a generator of characters for the given Unicode character category.
        """
        relative_path = os.path.join("data", "perluniprops", category + ".txt")
        binary_data = pkgutil.get_data("sacremoses", relative_path)
        for ch in binary_data.decode("utf-8"):
            yield ch
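
# A minimal usage sketch (illustration only, not part of this module's API):
# when porting a Perl tokenizer, the character lists are typically joined into
# a regex character class, roughly as follows:
#
#     import re
#     pup = Perluniprops()
#     open_punct = "".join(pup.chars("Open_Punctuation"))
#     OPEN_PUNCT_RE = re.compile("([{}])".format(re.escape(open_punct)))
#     padded = OPEN_PUNCT_RE.sub(r" \1 ", "(hello)")  # pads opening punctuation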


class NonbreakingPrefixes:
    """
    This class reads the nonbreaking prefix text files from the Moses
    Machine Translation toolkit. These lists are used in the Python port
    of the Moses word tokenizer.
    """

    def __init__(self):
        self.datadir = (
            os.path.dirname(os.path.abspath(__file__)) + "/data/nonbreaking_prefixes/"
        )
        self.available_langs = {
            "assamese": "as",
            "bengali": "bn",
            "catalan": "ca",
            "czech": "cs",
            "german": "de",
            "greek": "el",
            "english": "en",
            "spanish": "es",
            "estonian": "et",
            "finnish": "fi",
            "french": "fr",
            "irish": "ga",
            "gujarati": "gu",
            "hindi": "hi",
            "hungarian": "hu",
            "icelandic": "is",
            "italian": "it",
            "kannada": "kn",
            "lithuanian": "lt",
            "latvian": "lv",
            "malayalam": "ml",
            "manipuri": "mni",
            "marathi": "mr",
            "dutch": "nl",
            "oriya": "or",
            "punjabi": "pa",
            "polish": "pl",
            "portuguese": "pt",
            "romanian": "ro",
            "russian": "ru",
            "slovak": "sk",
            "slovenian": "sl",
            "swedish": "sv",
            "tamil": "ta",
            "telugu": "te",
            "tetum": "tdt",
            "cantonese": "yue",
            "chinese": "zh",
        }
        # Also, add the lang IDs as the keys.
        self.available_langs.update({v: v for v in self.available_langs.values()})
    def words(self, lang=None, ignore_lines_startswith="#"):
        """
        This method yields the nonbreaking prefixes for the specified
        language(s).

        >>> from sacremoses.corpus import NonbreakingPrefixes
        >>> nbp = NonbreakingPrefixes()
        >>> list(nbp.words('en'))[:10] == ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
        True
        >>> list(nbp.words('ta'))[:5] == ['\u0bb0', '\u0bc2', '\u0ba4\u0bbf\u0bb0\u0bc1', '\u0b8f', '\u0baa\u0bc0']
        True

        :return: a generator of words for the specified language(s).
        """
        # If *lang* is a known language name or ID, use its prefix file.
        if lang in self.available_langs:
            filenames = ["nonbreaking_prefix." + self.available_langs[lang]]
        # Use the nonbreaking prefixes of all languages when lang is None.
        elif lang is None:
            filenames = [
                "nonbreaking_prefix." + v for v in set(self.available_langs.values())
            ]
        # Fall back to English for unknown language codes.
        else:
            filenames = ["nonbreaking_prefix.en"]

        for filename in filenames:
            relative_path = os.path.join("data", "nonbreaking_prefixes", filename)
            binary_data = pkgutil.get_data("sacremoses", relative_path)
            for line in binary_data.decode("utf-8").splitlines():
                line = line.strip()
                if line and not line.startswith(ignore_lines_startswith):
                    yield line
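
# A minimal usage sketch (illustration only, not part of this module's API):
# a tokenizer typically loads the prefixes into a set so it can decide whether
# a token that ends with a period is an abbreviation whose period should stay
# attached, e.g.:
#
#     nbp = NonbreakingPrefixes()
#     english_prefixes = set(nbp.words("en"))
#     token = "Mr"
#     if token in english_prefixes:
#         pass  # do not split a following period off this token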


__all__ = ["Perluniprops", "NonbreakingPrefixes"]
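

# Optional self-check: running this module directly executes the doctest
# examples embedded in the docstrings above. This assumes the sacremoses
# package is importable, since pkgutil.get_data() resolves the data files
# through the installed package.
if __name__ == "__main__":
    import doctest

    doctest.testmod()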