Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/sacremoses/util.py: 68%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

37 statements  

1#!/usr/bin/env python3 

2# -*- coding: utf-8 -*- 

3 

4from itertools import tee, zip_longest 

5from xml.sax.saxutils import escape, unescape 

6 

7from joblib import Parallel, delayed 

8from tqdm import tqdm 

9 

10 

class CJKChars:
    """
    Inclusive Unicode code-point ranges for CJK-related blocks, following
    http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane

    Each attribute is a ``(first, last)`` tuple of integer code points, and
    ``ranges`` gathers those tuples in ascending order.
    """

    # Hangul Jamo (1100–11FF)
    Hangul_Jamo = (0x1100, 0x11FF)

    # One contiguous span covering all of the following blocks:
    #   CJK Radicals Supplement (2E80–2EFF)
    #   Kangxi Radicals (2F00–2FDF)
    #   Ideographic Description Characters (2FF0–2FFF)
    #   CJK Symbols and Punctuation (3000–303F)
    #   Hiragana (3040–309F)
    #   Katakana (30A0–30FF)
    #   Bopomofo (3100–312F)
    #   Hangul Compatibility Jamo (3130–318F)
    #   Kanbun (3190–319F)
    #   Bopomofo Extended (31A0–31BF)
    #   CJK Strokes (31C0–31EF)
    #   Katakana Phonetic Extensions (31F0–31FF)
    #   Enclosed CJK Letters and Months (3200–32FF)
    #   CJK Compatibility (3300–33FF)
    #   CJK Unified Ideographs Extension A (3400–4DBF)
    #   Yijing Hexagram Symbols (4DC0–4DFF)
    #   CJK Unified Ideographs (4E00–9FFF)
    #   Yi Syllables (A000–A48F)
    #   Yi Radicals (A490–A4CF)
    CJK_Radicals = (0x2E80, 0xA4CF)

    # Phags-pa (A840–A87F)
    Phags_Pa = (0xA840, 0xA87F)

    # Hangul Syllables (AC00–D7AF)
    Hangul_Syllables = (0xAC00, 0xD7AF)

    # CJK Compatibility Ideographs (F900–FAFF)
    CJK_Compatibility_Ideographs = (0xF900, 0xFAFF)

    # CJK Compatibility Forms (FE30–FE4F)
    CJK_Compatibility_Forms = (0xFE30, 0xFE4F)

    # Halfwidth forms of Katakana and Hangul characters (FF65–FFDC)
    Katakana_Hangul_Halfwidth = (0xFF65, 0xFFDC)

    # Ideographic Symbols and Punctuation (16FE0–16FFF)
    # NOTE(review): this tuple is not listed in `ranges` below; confirm
    # whether that omission is intentional before relying on it.
    Ideographic_Symbols_And_Punctuation = (0x16FE0, 0x16FFF)

    # Tangut (17000–187FF) together with Tangut Components (18800–18AFF)
    Tangut = (0x17000, 0x18AFF)

    # Kana Supplement (1B000–1B0FF) together with Kana Extended-A (1B100–1B12F)
    Kana_Supplement = (0x1B000, 0x1B12F)

    # Nushu (1B170–1B2FF)
    Nushu = (0x1B170, 0x1B2FF)

    # Supplementary Ideographic Plane (20000–2FFFF)
    Supplementary_Ideographic_Plane = (0x20000, 0x2FFFF)

    # Ascending list of the ranges above (see the review note for the one
    # tuple that is left out).
    ranges = [
        Hangul_Jamo,
        CJK_Radicals,
        Phags_Pa,
        Hangul_Syllables,
        CJK_Compatibility_Ideographs,
        CJK_Compatibility_Forms,
        Katakana_Hangul_Halfwidth,
        Tangut,
        Kana_Supplement,
        Nushu,
        Supplementary_Ideographic_Plane,
    ]

92 

93 

def is_cjk(character):
    """
    Tell whether a single character falls inside one of the CJK ranges.

    >>> CJKChars().ranges
    [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (94208, 101119), (110592, 110895), (110960, 111359), (131072, 196607)]
    >>> is_cjk('\u33fe')
    True
    >>> is_cjk('\uFE5F')
    False

    :param character: The character that needs to be checked; it is passed
        to ``ord()``, so it must be a string of length one.
    :type character: str
    :return: ``True`` when the code point lies in any range (bounds inclusive).
    :rtype: bool
    """
    # Same (first, last) pairs that the doctest above shows for
    # CJKChars().ranges, kept inline so this function stands on its own.
    cjk_ranges = (
        (4352, 4607),
        (11904, 42191),
        (43072, 43135),
        (44032, 55215),
        (63744, 64255),
        (65072, 65103),
        (65381, 65500),
        (94208, 101119),
        (110592, 110895),
        (110960, 111359),
        (131072, 196607),
    )
    # Resolve the code point once instead of once per range.
    code_point = ord(character)
    # A generator lets any() stop at the first matching range rather than
    # evaluating every comparison up front.
    return any(start <= code_point <= end for start, end in cjk_ranges)

127 

128 

def xml_escape(text):
    """
    Escape ``text`` for XML the way Moses does.

    ``xml.sax.saxutils.escape()`` is called with an extra entity table,
    because on its own it leaves some characters untouched that Moses
    escapes (see the first doctest below).

    >>> input_str = ''')| & < > ' " ] ['''
    >>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
    >>> escape(input_str) == expected_output
    True
    >>> xml_escape(input_str)
    ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :rtype: str
    """
    # Characters escaped on top of what saxutils.escape() handles by default.
    moses_entities = {
        "'": "&apos;",
        '"': "&quot;",
        "|": "&#124;",
        "[": "&#91;",
        "]": "&#93;",
    }
    escaped = escape(text, moses_entities)
    return escaped

158 

159 

def xml_unescape(text):
    """
    Turn Moses-style XML-escaped ``text`` back into its readable form.

    ``xml.sax.saxutils.unescape()`` is called with an extra entity table
    covering the additional entities that Moses emits.

    >>> from xml.sax.saxutils import unescape
    >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
    >>> expected = ''')| & < > \' " ] ['''
    >>> xml_unescape(s) == expected
    True

    :param text: The text that needs to be unescaped.
    :type text: str
    :rtype: str
    """
    # Entities decoded on top of what saxutils.unescape() handles by default.
    moses_entities = {
        "&apos;": "'",
        "&quot;": '"',
        "&#124;": "|",
        "&#91;": "[",
        "&#93;": "]",
    }
    restored = unescape(text, moses_entities)
    return restored

188 

189 

def pairwise(iterable):
    """
    Yield overlapping neighbour pairs: s -> (s0,s1), (s1,s2), (s2, s3), ...

    Recipe from https://docs.python.org/3/library/itertools.html#recipes

    :param iterable: Any iterable of items.
    :return: A ``zip`` iterator over consecutive item pairs.
    """
    # Two independent iterators over the same items.
    current, following = tee(iterable, 2)
    # Shift the second iterator one step ahead; the None default keeps an
    # empty input from raising StopIteration.
    next(following, None)
    return zip(current, following)

198 

199 

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks, padding the last one.

    From https://stackoverflow.com/a/16789869/610569

    :param iterable: Items to split into chunks.
    :param n: Length of every chunk.
    :param fillvalue: Padding used when the final chunk comes up short.
    :return: A ``zip_longest`` iterator of ``n``-tuples.
    """
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    shared = iter(iterable)
    # The same iterator object repeated n times: each output tuple pulls
    # n consecutive items from it.
    slots = (shared,) * n
    return zip_longest(*slots, fillvalue=fillvalue)

207 

208 

def parallelize_preprocess(func, iterator, processes, progress_bar=False):
    """
    Apply ``func`` to every item of ``iterator``, serially or via joblib.

    :param func: Callable invoked once per item.
    :param iterator: Iterable of items to process.
    :param processes: Requested worker count; a value of 1 or less takes the
        serial path.
    :type processes: int
    :param progress_bar: When true, ``iterator`` is wrapped in ``tqdm``.
    :type progress_bar: bool
    :return: A lazy ``map`` object on the serial path; otherwise whatever the
        ``joblib.Parallel`` call returns for the delayed jobs.
    """
    if progress_bar:
        iterator = tqdm(iterator)

    # Serial path: nothing is computed until the caller consumes the map.
    if processes <= 1:
        return map(func, iterator)

    pool = Parallel(n_jobs=processes)
    return pool(delayed(func)(item) for item in iterator)