Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/sacremoses/util.py: 68%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
4from itertools import tee, zip_longest
5from xml.sax.saxutils import escape, unescape
7from joblib import Parallel, delayed
8from tqdm import tqdm
class CJKChars:
    """
    An object that enumerates the code points of the CJK characters as listed on
    http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane

    Every attribute is an inclusive ``(first_code_point, last_code_point)``
    pair; ``ranges`` collects the pairs into a single list.
    """

    # Hangul Jamo (1100–11FF)
    Hangul_Jamo = (0x1100, 0x11FF)  # (4352, 4607)

    # CJK Radicals Supplement (2E80–2EFF)
    # Kangxi Radicals (2F00–2FDF)
    # Ideographic Description Characters (2FF0–2FFF)
    # CJK Symbols and Punctuation (3000–303F)
    # Hiragana (3040–309F)
    # Katakana (30A0–30FF)
    # Bopomofo (3100–312F)
    # Hangul Compatibility Jamo (3130–318F)
    # Kanbun (3190–319F)
    # Bopomofo Extended (31A0–31BF)
    # CJK Strokes (31C0–31EF)
    # Katakana Phonetic Extensions (31F0–31FF)
    # Enclosed CJK Letters and Months (3200–32FF)
    # CJK Compatibility (3300–33FF)
    # CJK Unified Ideographs Extension A (3400–4DBF)
    # Yijing Hexagram Symbols (4DC0–4DFF)
    # CJK Unified Ideographs (4E00–9FFF)
    # Yi Syllables (A000–A48F)
    # Yi Radicals (A490–A4CF)
    # All of the contiguous blocks above are covered by one span.
    CJK_Radicals = (0x2E80, 0xA4CF)  # (11904, 42191)

    # Phags-pa (A840–A87F)
    Phags_Pa = (0xA840, 0xA87F)  # (43072, 43135)

    # Hangul Syllables (AC00–D7AF)
    Hangul_Syllables = (0xAC00, 0xD7AF)  # (44032, 55215)

    # CJK Compatibility Ideographs (F900–FAFF)
    CJK_Compatibility_Ideographs = (0xF900, 0xFAFF)  # (63744, 64255)

    # CJK Compatibility Forms (FE30–FE4F)
    CJK_Compatibility_Forms = (0xFE30, 0xFE4F)  # (65072, 65103)

    # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters
    Katakana_Hangul_Halfwidth = (0xFF65, 0xFFDC)  # (65381, 65500)

    # Ideographic Symbols and Punctuation (16FE0–16FFF)
    Ideographic_Symbols_And_Punctuation = (0x16FE0, 0x16FFF)  # (94176, 94207)

    # Tangut (17000-187FF)
    # Tangut Components (18800-18AFF)
    Tangut = (0x17000, 0x18AFF)  # (94208, 101119)

    # Kana Supplement (1B000-1B0FF)
    # Kana Extended-A (1B100-1B12F)
    Kana_Supplement = (0x1B000, 0x1B12F)  # (110592, 110895)

    # Nushu (1B170-1B2FF)
    Nushu = (0x1B170, 0x1B2FF)  # (110960, 111359)

    # Supplementary Ideographic Plane (20000–2FFFF)
    Supplementary_Ideographic_Plane = (0x20000, 0x2FFFF)  # (131072, 196607)

    # NOTE(review): Ideographic_Symbols_And_Punctuation is defined above but is
    # not part of this list. The list is kept unchanged so existing callers see
    # the same ranges -- confirm whether the omission is intentional.
    ranges = [
        Hangul_Jamo,
        CJK_Radicals,
        Phags_Pa,
        Hangul_Syllables,
        CJK_Compatibility_Ideographs,
        CJK_Compatibility_Forms,
        Katakana_Hangul_Halfwidth,
        Tangut,
        Kana_Supplement,
        Nushu,
        Supplementary_Ideographic_Plane,
    ]
def is_cjk(character):
    """
    Check whether a single character lies in one of the CJK code point ranges.

    The ranges tested here are the same pairs that ``CJKChars.ranges`` lists.

    >>> CJKChars().ranges
    [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (94208, 101119), (110592, 110895), (110960, 111359), (131072, 196607)]
    >>> is_cjk('\u33fe')
    True
    >>> is_cjk('\uFE5F')
    False

    :param character: The character that needs to be checked; it is passed to
        ``ord()``, so it must be a string of length one.
    :type character: str
    :return: True if the character's code point is inside any of the ranges.
    :rtype: bool
    """
    # Resolve the code point once instead of once per range.
    code_point = ord(character)
    # A generator lets any() stop at the first matching range.
    return any(
        start <= code_point <= end
        for start, end in (
            (4352, 4607),
            (11904, 42191),
            (43072, 43135),
            (44032, 55215),
            (63744, 64255),
            (65072, 65103),
            (65381, 65500),
            (94208, 101119),
            (110592, 110895),
            (110960, 111359),
            (131072, 196607),
        )
    )
def xml_escape(text):
    """
    This function transforms the input text into an "escaped" version suitable
    for well-formed XML formatting.
    Note that the default xml.sax.saxutils.escape() function doesn't escape
    some characters that Moses does, so we have to manually add them to the
    entities dictionary.

    >>> input_str = ''')| & < > ' " ] ['''
    >>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
    >>> escape(input_str) == expected_output
    True
    >>> xml_escape(input_str)
    ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :return: The text with ``& < >`` and the extra Moses characters replaced
        by their XML entities.
    :rtype: str
    """
    # escape() handles "&", "<" and ">" itself before applying these extra
    # replacements, so the ampersands introduced here are not escaped again.
    return escape(
        text,
        entities={
            r"'": r"&apos;",
            r'"': r"&quot;",
            r"|": r"&#124;",
            r"[": r"&#91;",
            r"]": r"&#93;",
        },
    )
def xml_unescape(text):
    """
    This function transforms the "escaped" version suitable
    for well-formed XML formatting into a human-readable string.
    Note that the default xml.sax.saxutils.unescape() function doesn't unescape
    some characters that Moses does, so we have to manually add them to the
    entities dictionary.

    >>> from xml.sax.saxutils import unescape
    >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
    >>> expected = ''')| & < > \' " ] ['''
    >>> xml_unescape(s) == expected
    True

    :param text: The text that needs to be unescaped.
    :type text: str
    :return: The text with the XML entities replaced by the characters they
        stand for.
    :rtype: str
    """
    # unescape() applies these extra replacements and handles "&lt;", "&gt;"
    # and "&amp;" on its own.
    return unescape(
        text,
        entities={
            r"&apos;": r"'",
            r"&quot;": r'"',
            r"&#124;": r"|",
            r"&#91;": r"[",
            r"&#93;": r"]",
        },
    )
def pairwise(iterable):
    """
    Yield overlapping pairs of consecutive items.

    From https://docs.python.org/3/library/itertools.html#recipes
    s -> (s0,s1), (s1,s2), (s2, s3), ...

    :param iterable: Any iterable; it is consumed lazily.
    :return: An iterator of 2-tuples. It is empty when the input has fewer
        than two items.
    """
    first, second = tee(iterable)
    # Advance the second copy by one so the two iterators are offset;
    # the None default keeps an empty input from raising StopIteration.
    next(second, None)
    return zip(first, second)
def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks
    from https://stackoverflow.com/a/16789869/610569

    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx

    :param iterable: The items to be chunked.
    :param n: The length of every chunk.
    :param fillvalue: The value used to pad the last chunk when the input
        runs out before the chunk is full.
    :return: An iterator of n-tuples.
    """
    # The same iterator object is repeated n times, so zip_longest pulls n
    # consecutive items from it for every tuple it builds.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
def parallelize_preprocess(func, iterator, processes, progress_bar=False):
    """
    Apply ``func`` to every item of ``iterator``, optionally through joblib.

    :param func: A callable that takes one item.
    :param iterator: The items to process.
    :param processes: Number of jobs. When it is ``<= 1`` the work is not
        handed to joblib at all.
    :type processes: int
    :param progress_bar: When true, the iterator is wrapped in ``tqdm``.
    :type progress_bar: bool
    :return: With ``processes <= 1``, a lazy ``map`` object (nothing is
        computed until it is iterated); otherwise whatever the joblib
        ``Parallel`` call returns.
    """
    iterator = tqdm(iterator) if progress_bar else iterator
    if processes <= 1:
        # Serial path: stay lazy and avoid joblib's worker overhead.
        return map(func, iterator)
    return Parallel(n_jobs=processes)(delayed(func)(line) for line in iterator)