Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/grapheme.py: 22%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Grapheme cluster segmentation following Unicode Standard Annex #29.
4This module provides pure-Python implementation of the grapheme cluster boundary algorithm as
5defined in UAX #29: Unicode Text Segmentation.
7https://www.unicode.org/reports/tr29/
8"""
10from __future__ import annotations
12# std imports
13from enum import IntEnum
14from functools import lru_cache
16from typing import TYPE_CHECKING, NamedTuple
18# local
19from .bisearch import bisearch as _bisearch
20from .table_grapheme import (GRAPHEME_L,
21 GRAPHEME_T,
22 GRAPHEME_V,
23 GRAPHEME_LV,
24 INCB_EXTEND,
25 INCB_LINKER,
26 GRAPHEME_LVT,
27 INCB_CONSONANT,
28 GRAPHEME_EXTEND,
29 GRAPHEME_CONTROL,
30 GRAPHEME_PREPEND,
31 GRAPHEME_SPACINGMARK,
32 EXTENDED_PICTOGRAPHIC,
33 GRAPHEME_REGIONAL_INDICATOR)
35if TYPE_CHECKING: # pragma: no cover
36 # std imports
37 from collections.abc import Iterator
39# Maximum backward scan distance when finding grapheme cluster boundaries.
40# Covers all known Unicode grapheme clusters with margin; longer sequences are pathological.
41MAX_GRAPHEME_SCAN = 32
44class GCB(IntEnum):
45 """Grapheme Cluster Break property values."""
47 OTHER = 0
48 CR = 1
49 LF = 2
50 CONTROL = 3
51 EXTEND = 4
52 ZWJ = 5
53 REGIONAL_INDICATOR = 6
54 PREPEND = 7
55 SPACING_MARK = 8
56 L = 9
57 V = 10
58 T = 11
59 LV = 12
60 LVT = 13
63# All lru_cache sizes in this file use maxsize=1024, chosen by benchmarking UDHR data (500+
64# languages) and considering typical process-long sessions: western scripts need ~64 unique
65# codepoints, but CJK could reach ~2000 -- but likely not.
66@lru_cache(maxsize=1024)
67def _grapheme_cluster_break(ucs: int) -> GCB:
68 # pylint: disable=too-many-branches,too-complex
69 """Return the Grapheme_Cluster_Break property for a codepoint."""
70 # Single codepoint matches
71 if ucs == 0x000d:
72 return GCB.CR
73 if ucs == 0x000a:
74 return GCB.LF
75 if ucs == 0x200d:
76 return GCB.ZWJ
77 # Matching by codepoint ranges, requiring binary search
78 if _bisearch(ucs, GRAPHEME_CONTROL):
79 return GCB.CONTROL
80 if _bisearch(ucs, GRAPHEME_EXTEND):
81 return GCB.EXTEND
82 if _bisearch(ucs, GRAPHEME_REGIONAL_INDICATOR):
83 return GCB.REGIONAL_INDICATOR
84 if _bisearch(ucs, GRAPHEME_PREPEND):
85 return GCB.PREPEND
86 if _bisearch(ucs, GRAPHEME_SPACINGMARK):
87 return GCB.SPACING_MARK
88 if _bisearch(ucs, GRAPHEME_L):
89 return GCB.L
90 if _bisearch(ucs, GRAPHEME_V):
91 return GCB.V
92 if _bisearch(ucs, GRAPHEME_T):
93 return GCB.T
94 if _bisearch(ucs, GRAPHEME_LV):
95 return GCB.LV
96 if _bisearch(ucs, GRAPHEME_LVT):
97 return GCB.LVT
98 return GCB.OTHER
101@lru_cache(maxsize=1024)
102def _is_extended_pictographic(ucs: int) -> bool:
103 """Check if codepoint has Extended_Pictographic property."""
104 return bool(_bisearch(ucs, EXTENDED_PICTOGRAPHIC))
107@lru_cache(maxsize=1024)
108def _is_incb_linker(ucs: int) -> bool:
109 """Check if codepoint has InCB=Linker property."""
110 return bool(_bisearch(ucs, INCB_LINKER))
113@lru_cache(maxsize=1024)
114def _is_incb_consonant(ucs: int) -> bool:
115 """Check if codepoint has InCB=Consonant property."""
116 return bool(_bisearch(ucs, INCB_CONSONANT))
119@lru_cache(maxsize=1024)
120def _is_incb_extend(ucs: int) -> bool:
121 """Check if codepoint has InCB=Extend property."""
122 return bool(_bisearch(ucs, INCB_EXTEND))
125class BreakResult(NamedTuple):
126 """Result of grapheme cluster break decision."""
128 should_break: bool
129 ri_count: int
132@lru_cache(maxsize=1024)
133def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> BreakResult | None:
134 """
135 Check simple GCB-pair-based break rules (cacheable).
137 Returns BreakResult for rules that can be determined from GCB properties alone, or None if
138 complex lookback rules (GB9c, GB11) need to be checked.
139 """
140 # GB3: CR x LF
141 if prev_gcb == GCB.CR and curr_gcb == GCB.LF:
142 return BreakResult(should_break=False, ri_count=0)
144 # GB4: (Control|CR|LF) ÷
145 if prev_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):
146 return BreakResult(should_break=True, ri_count=0)
148 # GB5: ÷ (Control|CR|LF)
149 if curr_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):
150 return BreakResult(should_break=True, ri_count=0)
152 # GB6: L x (L|V|LV|LVT)
153 if prev_gcb == GCB.L and curr_gcb in (GCB.L, GCB.V, GCB.LV, GCB.LVT):
154 return BreakResult(should_break=False, ri_count=0)
156 # GB7: (LV|V) x (V|T)
157 if prev_gcb in (GCB.LV, GCB.V) and curr_gcb in (GCB.V, GCB.T):
158 return BreakResult(should_break=False, ri_count=0)
160 # GB8: (LVT|T) x T
161 if prev_gcb in (GCB.LVT, GCB.T) and curr_gcb == GCB.T:
162 return BreakResult(should_break=False, ri_count=0)
164 # GB9: x (Extend|ZWJ) - but ZWJ needs GB11 check, so only handle Extend here
165 if curr_gcb == GCB.EXTEND:
166 return BreakResult(should_break=False, ri_count=0)
168 # GB9a: x SpacingMark
169 if curr_gcb == GCB.SPACING_MARK:
170 return BreakResult(should_break=False, ri_count=0)
172 # GB9b: Prepend x
173 if prev_gcb == GCB.PREPEND:
174 return BreakResult(should_break=False, ri_count=0)
176 # GB9c and GB11 need lookback - return None to signal complex check needed
177 # GB12/13 (RI pairs) need ri_count state - also handled in main function
178 return None
181def _should_break(
182 prev_gcb: GCB,
183 curr_gcb: GCB,
184 text: str,
185 curr_idx: int,
186 ri_count: int,
187) -> BreakResult:
188 # pylint: disable=too-many-branches,too-complex
189 """
190 Determine if there should be a grapheme cluster break between prev and curr.
192 Implements UAX #29 grapheme cluster boundary rules.
193 """
194 # Try cached simple rules first
195 result = _simple_break_check(prev_gcb, curr_gcb)
196 if result is not None:
197 return result
199 # GB9: x ZWJ (not cached because GB11 needs lookback when prev is ZWJ)
200 if curr_gcb == GCB.ZWJ:
201 return BreakResult(should_break=False, ri_count=0)
203 # GB9c: Indic conjunct cluster
204 # \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]* \p{InCB=Linker}
205 # [\p{InCB=Extend}\p{InCB=Linker}]* x \p{InCB=Consonant}
206 curr_ucs = ord(text[curr_idx])
207 if _is_incb_consonant(curr_ucs):
208 has_linker = False
209 i = curr_idx - 1
210 while i >= 0:
211 prev_ucs = ord(text[i])
212 if _is_incb_linker(prev_ucs):
213 has_linker = True
214 i -= 1
215 elif _is_incb_extend(prev_ucs):
216 i -= 1
217 elif _is_incb_consonant(prev_ucs):
218 if has_linker:
219 return BreakResult(should_break=False, ri_count=0)
220 break
221 else:
222 break
224 # GB11: ExtPict Extend* ZWJ x ExtPict
225 if prev_gcb == GCB.ZWJ and _is_extended_pictographic(curr_ucs):
226 i = curr_idx - 2 # Skip the ZWJ at curr_idx - 1
227 while i >= 0:
228 prev_ucs = ord(text[i])
229 prev_prop = _grapheme_cluster_break(prev_ucs)
230 if prev_prop == GCB.EXTEND:
231 i -= 1
232 elif _is_extended_pictographic(prev_ucs):
233 return BreakResult(should_break=False, ri_count=0)
234 else:
235 break
237 # GB12/GB13: RI x RI (pair matching)
238 if prev_gcb == GCB.REGIONAL_INDICATOR and curr_gcb == GCB.REGIONAL_INDICATOR:
239 if ri_count % 2 == 1:
240 return BreakResult(should_break=False, ri_count=ri_count + 1)
241 return BreakResult(should_break=True, ri_count=1)
243 # GB999: Any ÷ Any
244 ri_count = 1 if curr_gcb == GCB.REGIONAL_INDICATOR else 0
245 return BreakResult(should_break=True, ri_count=ri_count)
248def iter_graphemes(
249 unistr: str,
250 start: int = 0,
251 end: int | None = None,
252) -> Iterator[str]:
253 r"""
254 Iterate over grapheme clusters in a Unicode string.
256 Grapheme clusters are "user-perceived characters" - what a user would
257 consider a single character, which may consist of multiple Unicode
258 codepoints (e.g., a base character with combining marks, emoji sequences).
260 :param unistr: The Unicode string to segment.
261 :param start: Starting index (default 0).
262 :param end: Ending index (default len(unistr)).
263 :yields: Grapheme cluster substrings.
265 Example::
267 >>> list(iter_graphemes('cafe\u0301'))
268 ['c', 'a', 'f', 'e\u0301']
269 >>> list(iter_graphemes('\U0001F468\u200D\U0001F469\u200D\U0001F467'))
270 ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467']
271 >>> list(iter_graphemes('\U0001F1FA\U0001F1F8'))
272 ['o', 'k', '\U0001F1FA\U0001F1F8']
274 .. versionadded:: 0.3.0
275 """
276 if not unistr:
277 return
279 length = len(unistr)
281 if end is None:
282 end = length
284 if start >= end or start >= length:
285 return
287 end = min(end, length)
289 # Track state for grapheme cluster boundaries
290 cluster_start = start
291 ri_count = 0
293 # Get GCB for first character
294 prev_gcb = _grapheme_cluster_break(ord(unistr[start]))
296 # Handle Regional Indicator count initialization
297 if prev_gcb == GCB.REGIONAL_INDICATOR:
298 ri_count = 1
300 for idx in range(start + 1, end):
301 curr_gcb = _grapheme_cluster_break(ord(unistr[idx]))
303 result = _should_break(prev_gcb, curr_gcb, unistr, idx, ri_count)
304 ri_count = result.ri_count
306 if result.should_break:
307 yield unistr[cluster_start:idx]
308 cluster_start = idx
310 prev_gcb = curr_gcb
312 # Yield the final cluster
313 yield unistr[cluster_start:end]
316def _find_cluster_start(text: str, pos: int) -> int:
317 """
318 Find the start of the grapheme cluster containing the character before pos.
320 Scans backwards from pos to find a safe starting point, then iterates forward using standard
321 break rules to find the actual cluster boundary.
323 :param text: The Unicode string.
324 :param pos: Position to search before (exclusive).
325 :returns: Start position of the grapheme cluster.
326 """
327 target_cp = ord(text[pos - 1])
329 # GB3: CR x LF - LF after CR is part of same cluster
330 if target_cp == 0x0A and pos >= 2 and text[pos - 2] == '\r':
331 return pos - 2
333 # Fast path: ASCII (except LF) starts its own cluster
334 if target_cp < 0x80:
335 # GB9b: Check for preceding PREPEND (rare: Arabic/Brahmic)
336 if pos >= 2 and target_cp >= 0x20:
337 prev_cp = ord(text[pos - 2])
338 if prev_cp >= 0x80 and _grapheme_cluster_break(prev_cp) == GCB.PREPEND:
339 return _find_cluster_start(text, pos - 1)
340 return pos - 1
342 # Scan backward to find a safe starting point
343 safe_start = pos - 1
344 while safe_start > 0 and (pos - safe_start) < MAX_GRAPHEME_SCAN:
345 cp = ord(text[safe_start])
346 if 0x20 <= cp < 0x80: # ASCII always starts a cluster
347 break
348 if _grapheme_cluster_break(cp) == GCB.CONTROL: # GB4
349 break
350 safe_start -= 1
352 # Verify forward to find the actual cluster boundary
353 cluster_start = safe_start
354 left_gcb = _grapheme_cluster_break(ord(text[safe_start]))
355 ri_count = 1 if left_gcb == GCB.REGIONAL_INDICATOR else 0
357 for i in range(safe_start + 1, pos):
358 right_gcb = _grapheme_cluster_break(ord(text[i]))
359 result = _should_break(left_gcb, right_gcb, text, i, ri_count)
360 ri_count = result.ri_count
361 if result.should_break:
362 cluster_start = i
363 left_gcb = right_gcb
365 return cluster_start
368def grapheme_boundary_before(unistr: str, pos: int) -> int:
369 r"""
370 Find the grapheme cluster boundary immediately before a position.
372 :param unistr: The Unicode string to search.
373 :param pos: Position in the string (0 < pos <= len(unistr)).
374 :returns: Start index of the grapheme cluster containing the character at pos-1.
376 Example::
378 >>> grapheme_boundary_before('Hello \U0001F44B\U0001F3FB', 8)
379 6
380 >>> grapheme_boundary_before('a\r\nb', 3)
381 1
383 .. versionadded:: 0.3.6
384 """
385 if pos <= 0:
386 return 0
387 return _find_cluster_start(unistr, min(pos, len(unistr)))
390def iter_graphemes_reverse(
391 unistr: str,
392 start: int = 0,
393 end: int | None = None,
394) -> Iterator[str]:
395 r"""
396 Iterate over grapheme clusters in reverse order (last to first).
398 :param unistr: The Unicode string to segment.
399 :param start: Starting index (default 0).
400 :param end: Ending index (default len(unistr)).
401 :yields: Grapheme cluster substrings in reverse order.
403 Example::
405 >>> list(iter_graphemes_reverse('cafe\u0301'))
406 ['e\u0301', 'f', 'a', 'c']
408 .. versionadded:: 0.3.6
409 """
410 if not unistr:
411 return
413 length = len(unistr)
415 end = length if end is None else min(end, length)
416 start = max(start, 0)
418 if start >= end or start >= length:
419 return
421 pos = end
422 while pos > start:
423 cluster_start = _find_cluster_start(unistr, pos)
424 # Don't yield partial graphemes that extend before start
425 if cluster_start < start:
426 break
427 yield unistr[cluster_start:pos]
428 pos = cluster_start