Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/grapheme.py: 23%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Grapheme cluster segmentation following Unicode Standard Annex #29.
4This module provides pure-Python implementation of the grapheme cluster boundary algorithm as
5defined in UAX #29: Unicode Text Segmentation.
7https://www.unicode.org/reports/tr29/
8"""
10from __future__ import annotations
12# std imports
13import sys
14import unicodedata
15from enum import IntEnum
16from functools import lru_cache
18from typing import TYPE_CHECKING, Optional, NamedTuple
20__lazy_modules__ = [
21 "wcwidth.bisearch",
22 "wcwidth.table_grapheme",
23]
24# local
25from .bisearch import bisearch as _bisearch
26from .table_grapheme import (GRAPHEME_L,
27 GRAPHEME_T,
28 GRAPHEME_V,
29 GRAPHEME_LV,
30 INCB_EXTEND,
31 INCB_LINKER,
32 GRAPHEME_LVT,
33 INCB_CONSONANT,
34 GRAPHEME_EXTEND,
35 GRAPHEME_CONTROL,
36 GRAPHEME_PREPEND,
37 GRAPHEME_SPACINGMARK,
38 EXTENDED_PICTOGRAPHIC,
39 GRAPHEME_REGIONAL_INDICATOR)
41if TYPE_CHECKING: # pragma: no cover
42 # std imports
43 from collections.abc import Iterator
45# check for python 3.15 for new iter_graphemes() function
46_HAS_PYTHON315_ITER_GRAPHEMES = (
47 sys.version_info >= (3, 15)
48 and hasattr(unicodedata, 'iter_graphemes')
49)
51# Maximum backward scan distance when finding grapheme cluster boundaries.
52# Covers all known Unicode grapheme clusters with margin; longer sequences are pathological.
53MAX_GRAPHEME_SCAN = 32
56class GCB(IntEnum):
57 """Grapheme Cluster Break property values."""
59 OTHER = 0
60 CR = 1
61 LF = 2
62 CONTROL = 3
63 EXTEND = 4
64 ZWJ = 5
65 REGIONAL_INDICATOR = 6
66 PREPEND = 7
67 SPACING_MARK = 8
68 L = 9
69 V = 10
70 T = 11
71 LV = 12
72 LVT = 13
75# All lru_cache sizes in this file use maxsize=1024, chosen by benchmarking UDHR data (500+
76# languages) and considering typical process-long sessions: western scripts need ~64 unique
77# codepoints, but CJK could reach ~2000 -- but likely not.
78@lru_cache(maxsize=1024)
79def _grapheme_cluster_break(ucs: int) -> GCB:
80 # pylint: disable=too-many-branches,too-complex
81 """Return the Grapheme_Cluster_Break property for a codepoint."""
82 # Single codepoint matches
83 if ucs == 0x000d:
84 return GCB.CR
85 if ucs == 0x000a:
86 return GCB.LF
87 if ucs == 0x200d:
88 return GCB.ZWJ
89 # Matching by codepoint ranges, requiring binary search
90 if _bisearch(ucs, GRAPHEME_CONTROL):
91 return GCB.CONTROL
92 if _bisearch(ucs, GRAPHEME_EXTEND):
93 return GCB.EXTEND
94 if _bisearch(ucs, GRAPHEME_REGIONAL_INDICATOR):
95 return GCB.REGIONAL_INDICATOR
96 if _bisearch(ucs, GRAPHEME_PREPEND):
97 return GCB.PREPEND
98 if _bisearch(ucs, GRAPHEME_SPACINGMARK):
99 return GCB.SPACING_MARK
100 if _bisearch(ucs, GRAPHEME_L):
101 return GCB.L
102 if _bisearch(ucs, GRAPHEME_V):
103 return GCB.V
104 if _bisearch(ucs, GRAPHEME_T):
105 return GCB.T
106 if _bisearch(ucs, GRAPHEME_LV):
107 return GCB.LV
108 if _bisearch(ucs, GRAPHEME_LVT):
109 return GCB.LVT
110 return GCB.OTHER
113@lru_cache(maxsize=1024)
114def _is_extended_pictographic(ucs: int) -> bool:
115 """Check if codepoint has Extended_Pictographic property."""
116 return bool(_bisearch(ucs, EXTENDED_PICTOGRAPHIC))
119@lru_cache(maxsize=1024)
120def _is_incb_linker(ucs: int) -> bool:
121 """Check if codepoint has InCB=Linker property."""
122 return bool(_bisearch(ucs, INCB_LINKER))
125@lru_cache(maxsize=1024)
126def _is_incb_consonant(ucs: int) -> bool:
127 """Check if codepoint has InCB=Consonant property."""
128 return bool(_bisearch(ucs, INCB_CONSONANT))
131@lru_cache(maxsize=1024)
132def _is_incb_extend(ucs: int) -> bool:
133 """Check if codepoint has InCB=Extend property."""
134 return bool(_bisearch(ucs, INCB_EXTEND))
137class BreakResult(NamedTuple):
138 """Result of grapheme cluster break decision."""
140 should_break: bool
141 ri_count: int
144@lru_cache(maxsize=1024)
145def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> Optional[BreakResult]:
146 """
147 Check simple GCB-pair-based break rules (cacheable).
149 Returns BreakResult for rules that can be determined from GCB properties alone, or None if
150 complex lookback rules (GB9c, GB11) need to be checked.
151 """
152 # GB3: CR x LF
153 if prev_gcb == GCB.CR and curr_gcb == GCB.LF:
154 return BreakResult(should_break=False, ri_count=0)
156 # GB4: (Control|CR|LF) ÷
157 if prev_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):
158 return BreakResult(should_break=True, ri_count=0)
160 # GB5: ÷ (Control|CR|LF)
161 if curr_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):
162 return BreakResult(should_break=True, ri_count=0)
164 # GB6: L x (L|V|LV|LVT)
165 if prev_gcb == GCB.L and curr_gcb in (GCB.L, GCB.V, GCB.LV, GCB.LVT):
166 return BreakResult(should_break=False, ri_count=0)
168 # GB7: (LV|V) x (V|T)
169 if prev_gcb in (GCB.LV, GCB.V) and curr_gcb in (GCB.V, GCB.T):
170 return BreakResult(should_break=False, ri_count=0)
172 # GB8: (LVT|T) x T
173 if prev_gcb in (GCB.LVT, GCB.T) and curr_gcb == GCB.T:
174 return BreakResult(should_break=False, ri_count=0)
176 # GB9: x (Extend|ZWJ) - but ZWJ needs GB11 check, so only handle Extend here
177 if curr_gcb == GCB.EXTEND:
178 return BreakResult(should_break=False, ri_count=0)
180 # GB9a: x SpacingMark
181 if curr_gcb == GCB.SPACING_MARK:
182 return BreakResult(should_break=False, ri_count=0)
184 # GB9b: Prepend x
185 if prev_gcb == GCB.PREPEND:
186 return BreakResult(should_break=False, ri_count=0)
188 # GB9c and GB11 need lookback - return None to signal complex check needed
189 # GB12/13 (RI pairs) need ri_count state - also handled in main function
190 return None
193def _should_break(
194 prev_gcb: GCB,
195 curr_gcb: GCB,
196 text: str,
197 curr_idx: int,
198 ri_count: int,
199) -> BreakResult:
200 # pylint: disable=too-many-branches,too-complex
201 """
202 Determine if there should be a grapheme cluster break between prev and curr.
204 Implements UAX #29 grapheme cluster boundary rules.
205 """
206 # Try cached simple rules first
207 result = _simple_break_check(prev_gcb, curr_gcb)
208 if result is not None:
209 return result
211 # GB9: x ZWJ (not cached because GB11 needs lookback when prev is ZWJ)
212 if curr_gcb == GCB.ZWJ:
213 return BreakResult(should_break=False, ri_count=0)
215 # GB9c: Indic conjunct cluster
216 # \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]* \p{InCB=Linker}
217 # [\p{InCB=Extend}\p{InCB=Linker}]* x \p{InCB=Consonant}
218 curr_ucs = ord(text[curr_idx])
219 if _is_incb_consonant(curr_ucs):
220 has_linker = False
221 i = curr_idx - 1
222 while i >= 0:
223 prev_ucs = ord(text[i])
224 if _is_incb_linker(prev_ucs):
225 has_linker = True
226 i -= 1
227 elif _is_incb_extend(prev_ucs):
228 i -= 1
229 elif _is_incb_consonant(prev_ucs):
230 if has_linker:
231 return BreakResult(should_break=False, ri_count=0)
232 break
233 else:
234 break
236 # GB11: ExtPict Extend* ZWJ x ExtPict
237 if prev_gcb == GCB.ZWJ and _is_extended_pictographic(curr_ucs):
238 i = curr_idx - 2 # Skip the ZWJ at curr_idx - 1
239 while i >= 0:
240 prev_ucs = ord(text[i])
241 prev_prop = _grapheme_cluster_break(prev_ucs)
242 if prev_prop == GCB.EXTEND:
243 i -= 1
244 elif _is_extended_pictographic(prev_ucs):
245 return BreakResult(should_break=False, ri_count=0)
246 else:
247 break
249 # GB12/GB13: RI x RI (pair matching)
250 if prev_gcb == GCB.REGIONAL_INDICATOR and curr_gcb == GCB.REGIONAL_INDICATOR:
251 if ri_count % 2 == 1:
252 return BreakResult(should_break=False, ri_count=ri_count + 1)
253 return BreakResult(should_break=True, ri_count=1)
255 # GB999: Any ÷ Any
256 ri_count = 1 if curr_gcb == GCB.REGIONAL_INDICATOR else 0
257 return BreakResult(should_break=True, ri_count=ri_count)
260def _iter_graphemes_stdlib(
261 unistr: str,
262 start: int = 0,
263 end: Optional[int] = None,
264) -> Iterator[str]:
265 r"""
266 Iterate over grapheme clusters using :func:`unicodedata.iter_graphemes`.
268 Grapheme clusters are "user-perceived characters" - what a user would
269 consider a single character, which may consist of multiple Unicode
270 codepoints (e.g., a base character with combining marks, emoji sequences).
272 :param unistr: The Unicode string to segment.
273 :param start: Starting index (default 0).
274 :param end: Ending index (default len(unistr)).
275 :yields: Grapheme cluster substrings.
277 Example::
279 >>> list(iter_graphemes('cafe\u0301'))
280 ['c', 'a', 'f', 'e\u0301']
281 >>> list(iter_graphemes('ok\U0001F468\u200D\U0001F469\u200D\U0001F467'))
282 ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467']
283 >>> list(iter_graphemes('ok\U0001F1FA\U0001F1F8'))
284 ['o', 'k', '\U0001F1FA\U0001F1F8']
286 .. versionadded:: 0.3.0
287 """
288 if not unistr:
289 return
291 length = len(unistr)
293 if end is None:
294 end = length
296 if start >= end or start >= length:
297 return
299 end = min(end, length)
301 full_segment = unistr[start:end]
302 for seg in unicodedata.iter_graphemes(full_segment): # type: ignore[attr-defined] # pylint: disable=no-member
303 yield full_segment[seg.start:seg.end]
306def _iter_graphemes_python(
307 unistr: str,
308 start: int = 0,
309 end: int | None = None,
310) -> Iterator[str]:
311 r"""
312 Iterate over grapheme clusters using :func:`unicodedata.iter_graphemes`.
314 Grapheme clusters are "user-perceived characters" - what a user would
315 consider a single character, which may consist of multiple Unicode
316 codepoints (e.g., a base character with combining marks, emoji sequences).
318 :param unistr: The Unicode string to segment.
319 :param start: Starting index (default 0).
320 :param end: Ending index (default len(unistr)).
321 :yields: Grapheme cluster substrings.
323 Example::
325 >>> list(iter_graphemes('cafe\u0301'))
326 ['c', 'a', 'f', 'e\u0301']
327 >>> list(iter_graphemes('ok\U0001F468\u200D\U0001F469\u200D\U0001F467'))
328 ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467']
329 >>> list(iter_graphemes('ok\U0001F1FA\U0001F1F8'))
330 ['o', 'k', '\U0001F1FA\U0001F1F8']
332 .. versionadded:: 0.3.0
333 """
334 if not unistr:
335 return
337 length = len(unistr)
339 if end is None:
340 end = length
342 if start >= end or start >= length:
343 return
345 end = min(end, length)
347 # Track state for grapheme cluster boundaries
348 cluster_start = start
349 ri_count = 0
351 # Get GCB for first character
352 prev_gcb = _grapheme_cluster_break(ord(unistr[start]))
354 # Handle Regional Indicator count initialization
355 if prev_gcb == GCB.REGIONAL_INDICATOR:
356 ri_count = 1
358 for idx in range(start + 1, end):
359 curr_gcb = _grapheme_cluster_break(ord(unistr[idx]))
361 result = _should_break(prev_gcb, curr_gcb, unistr, idx, ri_count)
362 ri_count = result.ri_count
364 if result.should_break:
365 yield unistr[cluster_start:idx]
366 cluster_start = idx
368 prev_gcb = curr_gcb
370 # Yield the final cluster
371 yield unistr[cluster_start:end]
374def _find_cluster_start(text: str, pos: int) -> int:
375 """
376 Find the start of the grapheme cluster containing the character before pos.
378 Scans backwards from pos to find a safe starting point, then iterates forward using standard
379 break rules to find the actual cluster boundary.
381 :param text: The Unicode string.
382 :param pos: Position to search before (exclusive).
383 :returns: Start position of the grapheme cluster.
384 """
385 target_cp = ord(text[pos - 1])
387 # GB3: CR x LF - LF after CR is part of same cluster
388 if target_cp == 0x0A and pos >= 2 and text[pos - 2] == '\r':
389 return pos - 2
391 # Fast path: ASCII (except LF) starts its own cluster
392 if target_cp < 0x80:
393 # GB9b: Check for preceding PREPEND (rare: Arabic/Brahmic)
394 if pos >= 2 and target_cp >= 0x20:
395 prev_cp = ord(text[pos - 2])
396 if prev_cp >= 0x80 and _grapheme_cluster_break(prev_cp) == GCB.PREPEND:
397 return _find_cluster_start(text, pos - 1)
398 return pos - 1
400 # Scan backward to find a safe starting point
401 safe_start = pos - 1
402 while safe_start > 0 and (pos - safe_start) < MAX_GRAPHEME_SCAN:
403 cp = ord(text[safe_start])
404 if 0x20 <= cp < 0x80: # ASCII always starts a cluster
405 break
406 if _grapheme_cluster_break(cp) == GCB.CONTROL: # GB4
407 break
408 safe_start -= 1
410 # Verify forward to find the actual cluster boundary
411 cluster_start = safe_start
412 left_gcb = _grapheme_cluster_break(ord(text[safe_start]))
413 ri_count = 1 if left_gcb == GCB.REGIONAL_INDICATOR else 0
415 for i in range(safe_start + 1, pos):
416 right_gcb = _grapheme_cluster_break(ord(text[i]))
417 result = _should_break(left_gcb, right_gcb, text, i, ri_count)
418 ri_count = result.ri_count
419 if result.should_break:
420 cluster_start = i
421 left_gcb = right_gcb
423 return cluster_start
426def grapheme_boundary_before(unistr: str, pos: int) -> int:
427 r"""
428 Find the grapheme cluster boundary immediately before a position.
430 :param unistr: The Unicode string to search.
431 :param pos: Position in the string (0 < pos <= len(unistr)).
432 :returns: Start index of the grapheme cluster containing the character at pos-1.
434 Example::
436 >>> grapheme_boundary_before('Hello \U0001F44B\U0001F3FB', 8)
437 6
438 >>> grapheme_boundary_before('a\r\nb', 3)
439 1
441 .. versionadded:: 0.3.6
442 """
443 if pos <= 0:
444 return 0
445 return _find_cluster_start(unistr, min(pos, len(unistr)))
448def iter_graphemes_reverse(
449 unistr: str,
450 start: int = 0,
451 end: Optional[int] = None,
452) -> Iterator[str]:
453 r"""
454 Iterate over grapheme clusters in reverse order (last to first).
456 :param unistr: The Unicode string to segment.
457 :param start: Starting index (default 0).
458 :param end: Ending index (default len(unistr)).
459 :yields: Grapheme cluster substrings in reverse order.
461 Example::
463 >>> list(iter_graphemes_reverse('cafe\u0301'))
464 ['e\u0301', 'f', 'a', 'c']
466 .. versionadded:: 0.3.6
467 """
468 if not unistr:
469 return
471 length = len(unistr)
473 end = length if end is None else min(end, length)
474 start = max(start, 0)
476 if start >= end or start >= length:
477 return
479 pos = end
480 while pos > start:
481 cluster_start = _find_cluster_start(unistr, pos)
482 # Don't yield partial graphemes that extend before start
483 if cluster_start < start:
484 break
485 yield unistr[cluster_start:pos]
486 pos = cluster_start
489# Bind iter_graphemes at module level to avoid per-call dispatch overhead.
490iter_graphemes = (
491 _iter_graphemes_stdlib if _HAS_PYTHON315_ITER_GRAPHEMES
492 else _iter_graphemes_python
493)