Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/sacremoses/util.py: 68%

1#!/usr/bin/env python3

2# -*- coding: utf-8 -*-

4from itertools import tee, zip_longest

5from xml.sax.saxutils import escape, unescape

7from joblib import Parallel, delayed

8from tqdm import tqdm

class CJKChars:
    """
    Enumerate the code-point ranges of the CJK characters as listed on
    http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane

    Every attribute is an inclusive ``(first, last)`` pair of integer code
    points; ``ranges`` collects the pairs into a single list.
    """

    # Hangul Jamo (1100–11FF)
    Hangul_Jamo = (4352, 4607)  # (ord("\u1100"), ord("\u11ff"))

    # One contiguous span covering all of the following blocks:
    # CJK Radicals Supplement (2E80–2EFF)
    # Kangxi Radicals (2F00–2FDF)
    # Ideographic Description Characters (2FF0–2FFF)
    # CJK Symbols and Punctuation (3000–303F)
    # Hiragana (3040–309F)
    # Katakana (30A0–30FF)
    # Bopomofo (3100–312F)
    # Hangul Compatibility Jamo (3130–318F)
    # Kanbun (3190–319F)
    # Bopomofo Extended (31A0–31BF)
    # CJK Strokes (31C0–31EF)
    # Katakana Phonetic Extensions (31F0–31FF)
    # Enclosed CJK Letters and Months (3200–32FF)
    # CJK Compatibility (3300–33FF)
    # CJK Unified Ideographs Extension A (3400–4DBF)
    # Yijing Hexagram Symbols (4DC0–4DFF)
    # CJK Unified Ideographs (4E00–9FFF)
    # Yi Syllables (A000–A48F)
    # Yi Radicals (A490–A4CF)
    CJK_Radicals = (11904, 42191)  # (ord("\u2e80"), ord("\ua4cf"))

    # Phags-pa (A840–A87F)
    Phags_Pa = (43072, 43135)  # (ord("\ua840"), ord("\ua87f"))

    # Hangul Syllables (AC00–D7AF)
    Hangul_Syllables = (44032, 55215)  # (ord("\uAC00"), ord("\uD7AF"))

    # CJK Compatibility Ideographs (F900–FAFF)
    CJK_Compatibility_Ideographs = (63744, 64255)  # (ord("\uF900"), ord("\uFAFF"))

    # CJK Compatibility Forms (FE30–FE4F)
    CJK_Compatibility_Forms = (65072, 65103)  # (ord("\uFE30"), ord("\uFE4F"))

    # Range U+FF65–FFDC encodes halfwidth forms of Katakana and Hangul characters
    Katakana_Hangul_Halfwidth = (65381, 65500)  # (ord("\uFF65"), ord("\uFFDC"))

    # Ideographic Symbols and Punctuation (16FE0–16FFF)
    # NOTE(review): this pair is defined but is not an element of ``ranges``
    # below; confirm whether the omission is intentional before adding it.
    Ideographic_Symbols_And_Punctuation = (
        94176,
        94207,
    )  # (ord("\U00016FE0"), ord("\U00016FFF"))

    # Tangut (17000-187FF)
    # Tangut Components (18800-18AFF)
    Tangut = (94208, 101119)  # (ord("\U00017000"), ord("\U00018AFF"))

    # Kana Supplement (1B000-1B0FF)
    # Kana Extended-A (1B100-1B12F)
    Kana_Supplement = (110592, 110895)  # (ord("\U0001B000"), ord("\U0001B12F"))

    # Nushu (1B170-1B2FF)
    Nushu = (110960, 111359)  # (ord("\U0001B170"), ord("\U0001B2FF"))

    # Supplementary Ideographic Plane (20000–2FFFF)
    Supplementary_Ideographic_Plane = (
        131072,
        196607,
    )  # (ord("\U00020000"), ord("\U0002FFFF"))

    # All ranges treated as CJK, in ascending code-point order.
    ranges = [
        Hangul_Jamo,
        CJK_Radicals,
        Phags_Pa,
        Hangul_Syllables,
        CJK_Compatibility_Ideographs,
        CJK_Compatibility_Forms,
        Katakana_Hangul_Halfwidth,
        Tangut,
        Kana_Supplement,
        Nushu,
        Supplementary_Ideographic_Plane,
    ]

def is_cjk(character):
    """
    Check whether a single character falls inside one of the CJK ranges.

    >>> CJKChars().ranges
    [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (94208, 101119), (110592, 110895), (110960, 111359), (131072, 196607)]
    >>> is_cjk('\u33fe')
    True
    >>> is_cjk('\uFE5F')
    False

    :param character: The character that needs to be checked.
    :type character: char
    :return: bool
    """
    # Resolve the code point once instead of once per range.
    code_point = ord(character)
    # The ranges are spelled out as literals (the same values the doctest
    # above shows for ``CJKChars().ranges``) so the check needs no attribute
    # lookups; the generator lets ``any`` stop at the first matching range.
    return any(
        start <= code_point <= end
        for start, end in (
            (4352, 4607),
            (11904, 42191),
            (43072, 43135),
            (44032, 55215),
            (63744, 64255),
            (65072, 65103),
            (65381, 65500),
            (94208, 101119),
            (110592, 110895),
            (110960, 111359),
            (131072, 196607),
        )
    )

def xml_escape(text):
    """
    Transform the input text into an "escaped" version suitable for
    well-formed XML formatting.

    Note that the default xml.sax.saxutils.escape() function does not escape
    some characters that Moses does, so they are added manually through the
    entities dictionary.

    >>> input_str = ''')| & < > ' " ] ['''
    >>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
    >>> escape(input_str) == expected_output
    True
    >>> xml_escape(input_str)
    ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :rtype: str
    """
    # escape() rewrites "&", "<" and ">" first and only then applies these
    # extra replacements, so the "&" inside the entities below is not
    # escaped a second time.
    return escape(
        text,
        entities={
            r"'": r"&apos;",
            r'"': r"&quot;",
            r"|": r"&#124;",
            r"[": r"&#91;",
            r"]": r"&#93;",
        },
    )

def xml_unescape(text):
    """
    Transform the "escaped" version suitable for well-formed XML formatting
    back into a human-readable string.

    Note that the default xml.sax.saxutils.unescape() function does not
    unescape some characters that Moses does, so they are added manually
    through the entities dictionary.

    >>> from xml.sax.saxutils import unescape
    >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
    >>> expected = ''')| & < > \' " ] ['''
    >>> xml_unescape(s) == expected
    True

    :param text: The text that needs to be unescaped.
    :type text: str
    :rtype: str
    """
    # unescape() applies these extra replacements before it finally turns
    # "&amp;" back into "&", so a literal "&amp;apos;" decodes to "&apos;"
    # rather than to "'".
    return unescape(
        text,
        entities={
            r"&apos;": r"'",
            r"&quot;": r'"',
            r"&#124;": r"|",
            r"&#91;": r"[",
            r"&#93;": r"]",
        },
    )

def pairwise(iterable):
    """
    Yield overlapping pairs of consecutive items.

    From https://docs.python.org/3/library/itertools.html#recipes
    s -> (s0,s1), (s1,s2), (s2, s3), ...

    :param iterable: Any iterable; it is consumed lazily.
    :return: An iterator of 2-tuples; empty when ``iterable`` has fewer
        than two items.
    """
    first, second = tee(iterable)
    # Advance the second copy by one so the zip pairs item i with item i + 1;
    # the ``None`` default keeps an empty input from raising StopIteration.
    next(second, None)
    return zip(first, second)

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks
    from https://stackoverflow.com/a/16789869/610569

    :param iterable: The items to group.
    :param n: The length of every chunk.
    :param fillvalue: Value used to pad the last chunk when the input runs
        out before the chunk is full.
    :return: An iterator of ``n``-tuples.
    """
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    # The list holds the SAME iterator n times, so zip_longest pulls n
    # consecutive items from it to build each tuple.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

def parallelize_preprocess(func, iterator, processes, progress_bar=False):
    """
    Apply ``func`` to every item of ``iterator``, optionally across several
    joblib workers.

    :param func: Callable applied to each item.
    :param iterator: Iterable of items to process.
    :param processes: Number of jobs; a value of 1 or less runs serially.
    :type processes: int
    :param progress_bar: When true, wrap ``iterator`` in ``tqdm`` to report
        progress.
    :type progress_bar: bool
    :return: A lazy ``map`` object in the serial case; otherwise the result
        of the ``joblib.Parallel`` call.
    """
    iterator = tqdm(iterator) if progress_bar else iterator
    if processes <= 1:
        # Serial path: nothing runs until the caller iterates the map.
        return map(func, iterator)
    return Parallel(n_jobs=processes)(delayed(func)(line) for line in iterator)