Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/wcwidth.py: 12%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2This is a python implementation of wcwidth() and wcswidth().
4https://github.com/jquast/wcwidth
6from Markus Kuhn's C code, retrieved from:
8 http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
10This is an implementation of wcwidth() and wcswidth() (defined in
11IEEE Std 1003.1-2001) for Unicode.
13http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
14http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
16In fixed-width output devices, Latin characters all occupy a single
17"cell" position of equal width, whereas ideographic CJK characters
18occupy two such cells. Interoperability between terminal-line
19applications and (teletype-style) character terminals using the
20UTF-8 encoding requires agreement on which character should advance
21the cursor by how many cell positions. No established formal
22standards exist at present on which Unicode character shall occupy
23how many cell positions on character terminals. These routines are
24a first attempt of defining such behavior based on simple rules
25applied to data provided by the Unicode Consortium.
27For some graphical characters, the Unicode standard explicitly
28defines a character-cell width via the definition of the East Asian
29FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
30In all these cases, there is no ambiguity about which width a
31terminal shall use. For characters in the East Asian Ambiguous (A)
32class, the width choice depends purely on a preference of backward
33compatibility with either historic CJK or Western practice.
34Choosing single-width for these characters is easy to justify as
35the appropriate long-term solution, as the CJK practice of
36displaying these characters as double-width comes from historic
37implementation simplicity (8-bit encoded characters were displayed
38single-width and 16-bit ones double-width, even for Greek,
39Cyrillic, etc.) and not any typographic considerations.
41Much less clear is the choice of width for the Not East Asian
42(Neutral) class. Existing practice does not dictate a width for any
43of these characters. It would nevertheless make sense
44typographically to allocate two character cells to characters such
45as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
46represented adequately with a single-width glyph. The following
47routines at present merely assign a single-cell width to all
48neutral characters, in the interest of simplicity. This is not
49entirely satisfactory and should be reconsidered before
50establishing a formal standard in this area. At the moment, the
51decision which Not East Asian (Neutral) characters should be
52represented by double-width glyphs cannot yet be answered by
53applying a simple rule from the Unicode database content. Setting
54up a proper standard for the behavior of UTF-8 character terminals
55will require a careful analysis not only of each Unicode character,
56but also of each presentation form, something the author of these
57routines has avoided to do so far.
59http://www.unicode.org/unicode/reports/tr11/
61Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
62"""
64from __future__ import annotations
66# std imports
67from functools import lru_cache
69from typing import TYPE_CHECKING
71# local
72from .bisearch import bisearch as _bisearch
73from .grapheme import iter_graphemes
74from .table_mc import CATEGORY_MC
75from .sgr_state import (_SGR_PATTERN,
76 _SGR_STATE_DEFAULT,
77 _sgr_state_update,
78 _sgr_state_is_active,
79 _sgr_state_to_sequence)
80from .table_vs16 import VS16_NARROW_TO_WIDE
81from .table_wide import WIDE_EASTASIAN
82from .table_zero import ZERO_WIDTH
83from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL
84from .table_grapheme import ISC_CONSONANT, EXTENDED_PICTOGRAPHIC, GRAPHEME_REGIONAL_INDICATOR
85from .table_ambiguous import AMBIGUOUS_EASTASIAN
86from .escape_sequences import (ZERO_WIDTH_PATTERN,
87 CURSOR_LEFT_SEQUENCE,
88 CURSOR_RIGHT_SEQUENCE,
89 INDETERMINATE_EFFECT_SEQUENCE)
90from .unicode_versions import list_versions
92if TYPE_CHECKING: # pragma: no cover
93 # std imports
94 from collections.abc import Iterator
96 from typing import Literal
98# Pre-compute table references for the latest (and only) Unicode version.
99_LATEST_VERSION = list_versions()[-1]
100_ZERO_WIDTH_TABLE = ZERO_WIDTH[_LATEST_VERSION]
101_WIDE_EASTASIAN_TABLE = WIDE_EASTASIAN[_LATEST_VERSION]
102_AMBIGUOUS_TABLE = AMBIGUOUS_EASTASIAN[next(iter(AMBIGUOUS_EASTASIAN))]
103_CATEGORY_MC_TABLE = CATEGORY_MC[_LATEST_VERSION]
104_REGIONAL_INDICATOR_SET = frozenset(
105 range(GRAPHEME_REGIONAL_INDICATOR[0][0], GRAPHEME_REGIONAL_INDICATOR[0][1] + 1)
106)
107_EMOJI_ZWJ_SET = frozenset(
108 cp for lo, hi in EXTENDED_PICTOGRAPHIC for cp in range(lo, hi + 1)
109) | _REGIONAL_INDICATOR_SET
# Inclusive (first, last) code point bounds used for the Fitzpatrick modifier test.
_FITZPATRICK_RANGE = (0x1F3FB, 0x1F3FF)

# Indic_Syllabic_Category=Virama codepoints, from IndicSyllabicCategory.txt.
# These are structurally tied to their scripts and not expected to change.
# https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
_ISC_VIRAMA_SET = frozenset({
    0x094D,   # DEVANAGARI SIGN VIRAMA
    0x09CD,   # BENGALI SIGN VIRAMA
    0x0A4D,   # GURMUKHI SIGN VIRAMA
    0x0ACD,   # GUJARATI SIGN VIRAMA
    0x0B4D,   # ORIYA SIGN VIRAMA
    0x0BCD,   # TAMIL SIGN VIRAMA
    0x0C4D,   # TELUGU SIGN VIRAMA
    0x0CCD,   # KANNADA SIGN VIRAMA
    0x0D4D,   # MALAYALAM SIGN VIRAMA
    0x0DCA,   # SINHALA SIGN AL-LAKUNA
    0x1B44,   # BALINESE ADEG ADEG
    0xA806,   # SYLOTI NAGRI SIGN HASANTA
    0xA8C4,   # SAURASHTRA SIGN VIRAMA
    0xA9C0,   # JAVANESE PANGKON
    0x11046,  # BRAHMI VIRAMA
    0x110B9,  # KAITHI SIGN VIRAMA
    0x111C0,  # SHARADA SIGN VIRAMA
    0x11235,  # KHOJKI SIGN VIRAMA
    0x1134D,  # GRANTHA SIGN VIRAMA
    0x11442,  # NEWA SIGN VIRAMA
    0x114C2,  # TIRHUTA SIGN VIRAMA
    0x115BF,  # SIDDHAM SIGN VIRAMA
    0x1163F,  # MODI SIGN VIRAMA
    0x116B6,  # TAKRI SIGN VIRAMA
    0x11839,  # DOGRA SIGN VIRAMA
    0x119E0,  # NANDINAGARI SIGN VIRAMA
    0x11C3F,  # BHAIKSUKI SIGN VIRAMA
})
# Module-local alias for the imported consonant range table (searched with _bisearch).
_ISC_CONSONANT_TABLE = ISC_CONSONANT
# In 'parse' mode, strings longer than this are checked for cursor-movement
# controls (BS, TAB, CR, cursor sequences); when absent, mode downgrades to
# 'ignore' to skip character-by-character parsing. The detection scan cost is
# negligible for long strings but wasted on short ones like labels or headings.
_WIDTH_FAST_PATH_MIN_LEN = 20

# Deletion table for str.translate() used by the fast 'ignore' mode. It removes:
#   - C0 controls, NUL through US (tab included)
#   - DEL
#   - C1 controls, U+0080 through U+009F
_CONTROL_CHAR_TABLE = str.maketrans(
    '',
    '',
    ''.join(map(chr, (*range(0x00, 0x20), 0x7f, *range(0x80, 0xa0)))),
)
# NOTE: unlike wcwidth.__all__, this wcwidth.wcwidth.__all__ does NOT define a public API, nor
# what should be pulled in by "from wcwidth.wcwidth import *". Imported names are explicitly
# re-exported here only to satisfy the type checkers (mypy). Yak shavings.
__all__ = (
    # re-exported tables and helpers
    'ZERO_WIDTH', 'WIDE_EASTASIAN', 'AMBIGUOUS_EASTASIAN', 'VS16_NARROW_TO_WIDE',
    'list_versions',
    # measurement
    'wcwidth', 'wcswidth', 'width',
    # sequence iteration, justification and clipping
    'iter_sequences', 'ljust', 'rjust', 'center', 'clip', 'strip_sequences',
    # legacy private helpers retained for external tools
    '_wcmatch_version', '_wcversion_value',
)
# Cache sizing, maxsize=1024: western scripts need ~64 unique codepoints per session, but
# CJK sessions may use ~2000 of ~3500 common hanzi/kanji. 1024 accommodates heavy CJK use.
# Performance floor at 32; bisearch is ~100ns per miss.
@lru_cache(maxsize=1024)
def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> int:  # pylint: disable=unused-argument
    r"""
    Return the number of terminal cells needed to display one Unicode codepoint.

    :param wc: A single Unicode character. An empty string is treated as NUL.
    :param unicode_version: Ignored. Retained for backwards compatibility.

        .. deprecated:: 0.3.0
            Only the latest Unicode version is now shipped.

    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts
        where ambiguous characters display as double-width. See
        :ref:`ambiguous_width` for details.
    :returns: ``0`` when ``wc`` has no printable effect on a terminal (such as
        NUL '\0'), ``-1`` when ``wc`` is a C0 or C1 control character (not
        printable, or of indeterminate effect), otherwise the number of
        columns occupied on a graphic terminal, ``1`` or ``2``.

    See :ref:`Specification` for details of cell measurement.
    """
    codepoint = ord(wc) if wc else 0

    # Printable ASCII is by far the most common input: answer before any table
    # lookup. About 40% faster for mostly-ascii documents, under 1% cost to others.
    if 0x20 <= codepoint < 0x7f:
        return 1

    # C0 (NUL excluded), DEL and C1 controls are -1 for compatibility with POSIX-like calls.
    if 0 < codepoint < 0x20 or 0x7f <= codepoint < 0xa0:
        return -1

    # Zero width
    if _bisearch(codepoint, _ZERO_WIDTH_TABLE):
        return 0

    # Wide (F/W categories), or Ambiguous (A category) when the caller requested
    # ambiguous_width=2; the ambiguous table is consulted only in that case.
    is_wide = _bisearch(codepoint, _WIDE_EASTASIAN_TABLE) or (
        ambiguous_width == 2 and _bisearch(codepoint, _AMBIGUOUS_TABLE)
    )
    return 2 if is_wide else 1
def wcswidth(
    pwcs: str,
    n: int | None = None,
    unicode_version: str = 'auto',
    ambiguous_width: int = 1,
) -> int:
    """
    Given a unicode string, return its printable length on a terminal.

    :param pwcs: Measure width of given unicode string.
    :param n: When ``n`` is None (default), return the length of the entire
        string, otherwise only the first ``n`` characters are measured. A value
        of ``n`` larger than ``len(pwcs)`` measures the entire string.

        Better to use string slicing capability, ``wcswidth(pwcs[:n])``, instead,
        for performance. This argument is a holdover from the POSIX function for
        matching signatures. Be careful that ``n`` is at grapheme boundaries.

    :param unicode_version: Ignored. Retained for backwards compatibility.

        .. deprecated:: 0.3.0
            Only the latest Unicode version is now shipped.

    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: The width, in cells, needed to display the first ``n`` characters
        of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control
        characters!

    See :ref:`Specification` for details of cell measurement.
    """
    # pylint: disable=unused-argument,too-many-locals,too-many-statements
    # pylint: disable=too-complex,too-many-branches
    # This function intentionally kept long without delegating functions to reduce function calls in
    # "hot path", the overhead per-character adds up.

    # Fast path: pure ASCII printable strings are always width == length
    if n is None and pwcs.isascii() and pwcs.isprintable():
        return len(pwcs)

    # Select wcwidth call pattern for best lru_cache performance:
    # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls
    # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct)
    _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width)

    # Clamp n to the string length: the loop below indexes pwcs[idx] directly, so an
    # unclamped n greater than len(pwcs) would raise IndexError instead of measuring
    # the whole string.
    end = len(pwcs) if n is None else min(n, len(pwcs))
    total_width = 0
    idx = 0
    last_measured_idx = -2  # Track index of last measured char for VS16
    last_measured_ucs = -1  # Codepoint of last measured char (for deferred emoji check)
    last_was_virama = False  # Virama conjunct formation state
    conjunct_pending = False  # Deferred +1 for bare conjuncts (no trailing Mc)
    while idx < end:
        char = pwcs[idx]
        ucs = ord(char)
        if ucs == 0x200D:
            if last_was_virama:
                # ZWJ after virama requests explicit half-form rendering but
                # does not change cell count — consume ZWJ only, let the next
                # consonant be handled by the virama conjunct rule.
                idx += 1
            elif idx + 1 < end:
                # Emoji ZWJ: skip next character unconditionally.
                idx += 2
                last_was_virama = False
            else:
                idx += 1
                last_was_virama = False
            continue
        if ucs == 0xFE0F and last_measured_idx >= 0:
            # VS16 following a measured character: add 1 if that character is
            # known to be converted from narrow to wide by VS16.
            total_width += _bisearch(ord(pwcs[last_measured_idx]),
                                     VS16_NARROW_TO_WIDE["9.0.0"])
            last_measured_idx = -2  # Prevent double application
            # VS16 preserves emoji context: last_measured_ucs stays as the base
            idx += 1
            continue
        # Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+)
        if ucs > 0xFFFF:
            if ucs in _REGIONAL_INDICATOR_SET:
                # Lazy RI pairing: count preceding consecutive RIs only when the last one is
                # received, because RIs are received so rarely it's better than per-loop
                # tracking of 'last char was an RI'.
                ri_before = 0
                j = idx - 1
                while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET:
                    ri_before += 1
                    j -= 1
                if ri_before % 2 == 1:
                    # Second RI in pair: contributes 0 (pair = one 2-cell flag) using an
                    # even-or-odd check to determine, 'CAUS' would be two flags, but 'CAU'
                    # would be 1 flag and wide 'U'.
                    idx += 1
                    last_measured_ucs = ucs
                    continue
                # First or unpaired RI: measured normally (width 2 from table)
            # Fitzpatrick modifier: zero-width when following emoji base
            elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1]
                  and last_measured_ucs in _EMOJI_ZWJ_SET):
                idx += 1
                continue
        # Virama conjunct formation: consonant following virama contributes 0 width.
        # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category
        if last_was_virama and _bisearch(ucs, _ISC_CONSONANT_TABLE):
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
            conjunct_pending = True
            idx += 1
            continue
        wcw = _wcwidth(char)
        if wcw < 0:
            # early return -1 on C0 and C1 control characters
            return wcw
        if wcw > 0:
            if conjunct_pending:
                total_width += 1
                conjunct_pending = False
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
        elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE):
            # Spacing Combining Mark (Mc) following a base character adds 1
            wcw = 1
            last_measured_idx = -2
            last_was_virama = False
            conjunct_pending = False
        else:
            last_was_virama = ucs in _ISC_VIRAMA_SET
        total_width += wcw
        idx += 1
    if conjunct_pending:
        total_width += 1
    return total_width
# NOTE: _wcversion_value and _wcmatch_version are no longer used internally
# by wcwidth since version 0.5.0 (only the latest Unicode version is shipped).
#
# They are retained for API compatibility with external tools like ucs-detect
# that may use these private functions.


@lru_cache(maxsize=128)
def _wcversion_value(ver_string: str) -> tuple[int, ...]:  # pragma: no cover
    """
    Convert a dotted version string into a tuple of integers.

    .. deprecated:: 0.3.0

        This function is no longer used internally by wcwidth but is retained
        for API compatibility with external tools.

    :param ver_string: Unicode version string, of form ``n.n.n``.
    :returns: One integer per dot-separated component, ``tuple(int, [...])``.
    """
    return tuple(int(component) for component in ver_string.split('.'))
@lru_cache(maxsize=8)
def _wcmatch_version(given_version: str) -> str:  # pylint: disable=unused-argument
    """
    Return the one Unicode version this module supports.

    .. deprecated:: 0.3.0
        The argument no longer influences the result; the latest version is
        always returned.

    This function is no longer used internally by wcwidth but is retained
    for API compatibility with external tools.

    :param given_version: Ignored. Any value is accepted for compatibility.
    :returns: The latest unicode version string.
    """
    latest = _LATEST_VERSION
    return latest
def iter_sequences(text: str) -> Iterator[tuple[str, bool]]:
    r"""
    Split text into alternating runs of plain text and terminal escape sequences.

    Each yielded item is a ``(segment, is_sequence)`` tuple, where
    ``is_sequence`` is ``True`` when the segment begins with ESC: either a
    recognized terminal escape sequence, or a single ESC character that does
    not begin a recognized sequence.

    :param text: String to iterate through.
    :returns: Iterator of (segment, is_sequence) tuples.

    .. versionadded:: 0.3.0

    Example::

        >>> list(iter_sequences('hello'))
        [('hello', False)]
        >>> list(iter_sequences('\x1b[31mred'))
        [('\x1b[31m', True), ('red', False)]
        >>> list(iter_sequences('\x1b[1m\x1b[31m'))
        [('\x1b[1m', True), ('\x1b[31m', True)]
    """
    total = len(text)
    cursor = 0
    while cursor < total:
        esc_at = text.find('\x1b', cursor)
        if esc_at < 0:
            # No further ESC: everything that remains is plain text.
            yield (text[cursor:], False)
            return

        # Plain text accumulated before the ESC, if any.
        if esc_at > cursor:
            yield (text[cursor:esc_at], False)

        found = ZERO_WIDTH_PATTERN.match(text, esc_at)
        if found:
            yield (found.group(), True)
            cursor = found.end()
        else:
            # Lone ESC or unrecognized - yield as sequence anyway
            yield ('\x1b', True)
            cursor = esc_at + 1
def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int:
    """
    Measure text for width() when control_codes='ignore' (the fast path).

    Escape sequences are removed first, then C0/C1 control characters, and the
    remainder is measured by wcswidth().
    """
    without_sequences = strip_sequences(text)
    printable_only = without_sequences.translate(_CONTROL_CHAR_TABLE)
    return wcswidth(printable_only, ambiguous_width=ambiguous_width)
def width(
    text: str,
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    tabsize: int = 8,
    ambiguous_width: int = 1,
) -> int:
    r"""
    Return printable width of text containing many kinds of control codes and sequences.

    Unlike :func:`wcswidth`, this function handles most control characters and many popular terminal
    output sequences.  Never returns -1.

    :param text: String to measure.
    :param control_codes: How to handle control characters and sequences:

        - ``'parse'`` (default): Track horizontal cursor movement from BS ``\b``, CR ``\r``, TAB
          ``\t``, and cursor left and right movement sequences.  Vertical movement (LF, VT, FF) and
          indeterminate sequences are zero-width.
        - ``'strict'``: Like parse, but raises :exc:`ValueError` on control characters with
          indeterminate results of the screen or cursor, like clear or vertical movement. Generally,
          these should be handled with a virtual terminal emulator (like 'pyte').
        - ``'ignore'``: All C0 and C1 control characters and escape sequences are measured as
          width 0. This is the fastest measurement for text already filtered or known not to contain
          any kinds of control codes or sequences. TAB ``\t`` is zero-width; for tab expansion,
          pre-process: ``text.replace('\t', ' ' * 8)``.

    :param tabsize: Tab stop width for ``'parse'`` and ``'strict'`` modes. Default is 8.
        Must be positive; a non-positive value makes TAB zero-width. Has no effect when
        ``control_codes='ignore'``.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: Maximum cursor position reached, "extent", accounting for cursor movement sequences
        present in ``text`` according to given parameters.  This represents the rightmost column the
        cursor reaches.  Always a non-negative integer.

    :raises ValueError: If ``control_codes='strict'`` and control characters with indeterminate
        effects, such as vertical movement or clear sequences are encountered, or on unexpected
        C0 or C1 control code. Also raised when ``control_codes`` is not one of the valid values.

    .. versionadded:: 0.3.0

    Examples::

        >>> width('hello')
        5
        >>> width('コンニチハ')
        10
        >>> width('\x1b[31mred\x1b[0m')
        3
        >>> width('\x1b[31mred\x1b[0m', control_codes='ignore')  # same result (ignored)
        3
        >>> width('123\b4')   # backspace overwrites previous cell (outputs '124')
        3
        >>> width('abc\t')    # tab caused cursor to move to column 8
        8
        >>> width('1\x1b[10C')   # '1' + cursor right 10, cursor ends on column 11
        11
        >>> width('1\x1b[10C', control_codes='ignore')   # faster but wrong in this case
        1
    """
    # pylint: disable=too-complex,too-many-branches,too-many-statements,too-many-locals
    # This could be broken into sub-functions (#1, #3, and 6 especially), but for reduced overhead
    # considering this function is a likely "hot path", they are inlined, breaking many of our
    # complexity rules.

    # Enforce the documented contract: an unknown mode would otherwise be silently
    # measured as non-strict 'parse'.
    if control_codes not in ('parse', 'strict', 'ignore'):
        raise ValueError(
            "control_codes must be one of 'parse', 'strict', or 'ignore', "
            f"got {control_codes!r}"
        )

    # Fast path for ASCII printable (no tabs, escapes, or control chars)
    if text.isascii() and text.isprintable():
        return len(text)

    # Fast parse: if no horizontal cursor movements are possible, switch to 'ignore' mode.
    # Only check for longer strings - the detection overhead hurts short string performance.
    if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN:
        # Check for cursor-affecting control characters
        if '\b' not in text and '\t' not in text and '\r' not in text:
            # Check for escape sequences - if none, or only non-cursor-movement sequences
            if '\x1b' not in text or (
                not CURSOR_RIGHT_SEQUENCE.search(text) and
                not CURSOR_LEFT_SEQUENCE.search(text)
            ):
                control_codes = 'ignore'

    # Fast path for ignore mode -- this is useful if you know the text is already "clean"
    if control_codes == 'ignore':
        return _width_ignored_codes(text, ambiguous_width)

    strict = control_codes == 'strict'
    # Track absolute positions: tab stops need modulo on absolute column, CR resets to 0.
    # Initialize max_extent to 0 so backward movement (CR, BS) won't yield negative width.
    current_col = 0
    max_extent = 0
    idx = 0
    last_measured_idx = -2  # Track index of last measured char for VS16; -2 can never match idx-1
    last_measured_ucs = -1  # Codepoint of last measured char (for deferred emoji check)
    last_was_virama = False  # Virama conjunct formation state
    conjunct_pending = False  # Deferred +1 for bare conjuncts (no trailing Mc)
    text_len = len(text)

    # Select wcwidth call pattern for best lru_cache performance:
    # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls
    # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct)
    _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width)

    while idx < text_len:
        char = text[idx]

        # 1. Handle ESC sequences
        if char == '\x1b':
            match = ZERO_WIDTH_PATTERN.match(text, idx)
            if match:
                seq = match.group()
                if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq):
                    raise ValueError(f"Indeterminate cursor sequence at position {idx}")
                # Apply cursor movement
                right = CURSOR_RIGHT_SEQUENCE.match(seq)
                if right:
                    current_col += int(right.group(1) or 1)
                else:
                    left = CURSOR_LEFT_SEQUENCE.match(seq)
                    if left:
                        current_col = max(0, current_col - int(left.group(1) or 1))
                idx = match.end()
            else:
                # Lone or unrecognized ESC: consumed as zero-width
                idx += 1
            max_extent = max(max_extent, current_col)
            continue

        # 2. Handle illegal and vertical control characters (zero width, error in strict)
        if char in ILLEGAL_CTRL:
            if strict:
                raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}")
            idx += 1
            continue

        if char in VERTICAL_CTRL:
            if strict:
                raise ValueError(f"Vertical movement character {ord(char):#x} at position {idx}")
            idx += 1
            continue

        # 3. Handle horizontal movement characters
        if char in HORIZONTAL_CTRL:
            if char == '\x09' and tabsize > 0:  # Tab
                current_col += tabsize - (current_col % tabsize)
            elif char == '\x08':  # Backspace
                if current_col > 0:
                    current_col -= 1
            elif char == '\x0d':  # Carriage return
                current_col = 0
            max_extent = max(max_extent, current_col)
            idx += 1
            continue

        # 4. Handle ZWJ
        if char == '\u200D':
            if last_was_virama:
                # ZWJ after virama requests explicit half-form rendering but
                # does not change cell count — consume ZWJ only, let the next
                # consonant be handled by the virama conjunct rule.
                idx += 1
            elif idx + 1 < text_len:
                # Emoji ZWJ: skip next character unconditionally.
                idx += 2
                last_was_virama = False
            else:
                idx += 1
                last_was_virama = False
            continue

        # 5. Handle other zero-width characters (control chars)
        if char in ZERO_WIDTH_CTRL:
            idx += 1
            continue

        ucs = ord(char)

        # 6. Handle VS16: converts preceding narrow character to wide
        if ucs == 0xFE0F:
            if last_measured_idx == idx - 1:
                if _bisearch(ord(text[last_measured_idx]), VS16_NARROW_TO_WIDE["9.0.0"]):
                    current_col += 1
                    max_extent = max(max_extent, current_col)
            # VS16 preserves emoji context: last_measured_ucs stays as the base
            idx += 1
            continue

        # 6b. Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+)
        if ucs > 0xFFFF:
            if ucs in _REGIONAL_INDICATOR_SET:
                # Lazy RI pairing: count preceding consecutive RIs
                ri_before = 0
                j = idx - 1
                while j >= 0 and ord(text[j]) in _REGIONAL_INDICATOR_SET:
                    ri_before += 1
                    j -= 1
                if ri_before % 2 == 1:
                    # Second RI of a pair contributes no additional width
                    last_measured_ucs = ucs
                    idx += 1
                    continue
            # 6c. Fitzpatrick modifier: zero-width when following emoji base
            elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1]
                  and last_measured_ucs in _EMOJI_ZWJ_SET):
                idx += 1
                continue

        # 7. Virama conjunct formation: consonant following virama contributes 0 width.
        # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category
        if last_was_virama and _bisearch(ucs, _ISC_CONSONANT_TABLE):
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
            conjunct_pending = True
            idx += 1
            continue

        # 8. Normal characters: measure with wcwidth
        w = _wcwidth(char)
        if w > 0:
            if conjunct_pending:
                current_col += 1
                conjunct_pending = False
            current_col += w
            max_extent = max(max_extent, current_col)
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
        elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE):
            # Spacing Combining Mark (Mc) following a base character adds 1
            current_col += 1
            max_extent = max(max_extent, current_col)
            last_measured_idx = -2
            last_was_virama = False
            conjunct_pending = False
        else:
            last_was_virama = ucs in _ISC_VIRAMA_SET
        idx += 1

    # A conjunct still pending at end of text is charged its deferred cell.
    if conjunct_pending:
        current_col += 1
        max_extent = max(max_extent, current_col)
    return max_extent
def ljust(
    text: str,
    dest_width: int,
    fillchar: str = ' ',
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    ambiguous_width: int = 1,
) -> str:
    r"""
    Pad text on the right so that it occupies ``dest_width`` terminal cells.

    Text already at least ``dest_width`` cells wide is returned without padding.

    :param text: String to justify, may contain terminal sequences.
    :param dest_width: Total display width of result in terminal cells.
    :param fillchar: Single character for padding (default space). Must have
        display width of 1 (not wide, not zero-width, not combining). Unicode
        characters like ``'·'`` are acceptable. The width is not validated.
    :param control_codes: How to handle control sequences when measuring.
        Passed to :func:`width` for measurement.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: Text padded on the right to reach ``dest_width``.

    .. versionadded:: 0.3.0

    Example::

        >>> wcwidth.ljust('hi', 5)
        'hi   '
        >>> wcwidth.ljust('\x1b[31mhi\x1b[0m', 5)
        '\x1b[31mhi\x1b[0m   '
        >>> wcwidth.ljust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6)
        '👨‍👩‍👧    '
    """
    # Printable ASCII occupies exactly one cell per character; skip width() entirely.
    is_plain = text.isascii() and text.isprintable()
    measured = len(text) if is_plain else width(
        text, control_codes=control_codes, ambiguous_width=ambiguous_width)
    return text + fillchar * max(0, dest_width - measured)
def rjust(
    text: str,
    dest_width: int,
    fillchar: str = ' ',
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    ambiguous_width: int = 1,
) -> str:
    r"""
    Pad text on the left so that it occupies ``dest_width`` terminal cells.

    Text already at least ``dest_width`` cells wide is returned without padding.

    :param text: String to justify, may contain terminal sequences.
    :param dest_width: Total display width of result in terminal cells.
    :param fillchar: Single character for padding (default space). Must have
        display width of 1 (not wide, not zero-width, not combining). Unicode
        characters like ``'·'`` are acceptable. The width is not validated.
    :param control_codes: How to handle control sequences when measuring.
        Passed to :func:`width` for measurement.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: Text padded on the left to reach ``dest_width``.

    .. versionadded:: 0.3.0

    Example::

        >>> wcwidth.rjust('hi', 5)
        '   hi'
        >>> wcwidth.rjust('\x1b[31mhi\x1b[0m', 5)
        '   \x1b[31mhi\x1b[0m'
        >>> wcwidth.rjust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6)
        '    👨‍👩‍👧'
    """
    # Printable ASCII occupies exactly one cell per character; skip width() entirely.
    is_plain = text.isascii() and text.isprintable()
    measured = len(text) if is_plain else width(
        text, control_codes=control_codes, ambiguous_width=ambiguous_width)
    return fillchar * max(0, dest_width - measured) + text
def center(
    text: str,
    dest_width: int,
    fillchar: str = ' ',
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    ambiguous_width: int = 1,
) -> str:
    r"""
    Pad text on both sides so that it occupies ``dest_width`` terminal cells.

    :param text: String to center, may contain terminal sequences.
    :param dest_width: Total display width of result in terminal cells.
    :param fillchar: Single character for padding (default space). Must have
        display width of 1 (not wide, not zero-width, not combining). Unicode
        characters like ``'·'`` are acceptable. The width is not validated.
    :param control_codes: How to handle control sequences when measuring.
        Passed to :func:`width` for measurement.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: Text padded on both sides to reach ``dest_width``.

    An odd amount of padding is split the way Python's :meth:`str.center`
    splits it: the extra cell goes on the left when ``dest_width`` is also
    odd, otherwise on the right.

    .. versionadded:: 0.3.0

    Example::

        >>> wcwidth.center('hi', 6)
        '  hi  '
        >>> wcwidth.center('\x1b[31mhi\x1b[0m', 6)
        '  \x1b[31mhi\x1b[0m  '
        >>> wcwidth.center('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6)
        '  👨‍👩‍👧  '
    """
    # Printable ASCII occupies exactly one cell per character; skip width() entirely.
    is_plain = text.isascii() and text.isprintable()
    measured = len(text) if is_plain else width(
        text, control_codes=control_codes, ambiguous_width=ambiguous_width)
    slack = max(0, dest_width - measured)
    # matching https://jazcap53.github.io/pythons-eccentric-strcenter.html
    lead = slack // 2 + (slack & dest_width & 1)
    trail = slack - lead
    return fillchar * lead + text + fillchar * trail
def strip_sequences(text: str) -> str:
    r"""
    Remove terminal escape sequences from ``text``.

    Every substring matched by ``ZERO_WIDTH_PATTERN`` is deleted and the
    remaining characters are returned in their original order. ESC
    sequences that are unknown or incomplete are left in place.

    :param text: String that may contain terminal escape sequences.
    :returns: ``text`` with the recognised escape sequences deleted.

    .. versionadded:: 0.3.0

    Example::

        >>> strip_sequences('\x1b[31mred\x1b[0m')
        'red'
        >>> strip_sequences('hello')
        'hello'
        >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text')
        'bold red text'
    """
    without_sequences = ZERO_WIDTH_PATTERN.sub('', text)
    return without_sequences
def clip(
    text: str,
    start: int,
    end: int,
    *,
    fillchar: str = ' ',
    tabsize: int = 8,
    ambiguous_width: int = 1,
    propagate_sgr: bool = True,
) -> str:
    r"""
    Clip text to display columns ``(start, end)`` while preserving all terminal sequences.

    This function extracts a substring based on visible column positions rather than
    character indices. Terminal escape sequences are preserved in the output since
    they have zero display width. If a wide character (width 2) would be split at
    either boundary, it is replaced with ``fillchar``.

    TAB characters (``\t``) are expanded to spaces up to the next tab stop,
    controlled by the ``tabsize`` parameter.

    Other cursor movement characters (backspace, carriage return) and cursor
    movement sequences are passed through unchanged as zero-width.

    A negative ``start`` is treated as ``0``. An empty string is returned when
    ``end`` is less than or equal to the (clamped) ``start``.

    :param text: String to clip, may contain terminal escape sequences.
    :param start: Absolute starting column (inclusive, 0-indexed).
    :param end: Absolute ending column (exclusive).
    :param fillchar: Character to use when a wide character must be split at
        a boundary (default space). Must have display width of 1.
    :param tabsize: Tab stop width (default 8). Set to 0 to pass tabs through
        as zero-width (preserved in output but don't advance column position).
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :param propagate_sgr: If True (default), SGR (terminal styling) sequences
        are propagated. The result begins with any active style at the start
        position and ends with a reset sequence if styles are active.
    :returns: Substring of ``text`` spanning display columns ``(start, end)``,
        with all terminal sequences preserved and wide characters at boundaries
        replaced with ``fillchar``.

    SGR (terminal styling) sequences are propagated by default. The result
    begins with any active style and ends with a reset::

        >>> clip('\x1b[1;34mHello world\x1b[0m', 6, 11)
        '\x1b[1;34mworld\x1b[0m'

    Set ``propagate_sgr=False`` to disable this behavior.

    .. versionadded:: 0.3.0

    .. versionchanged:: 0.5.0
       Added ``propagate_sgr`` parameter (default True).

    Example::

        >>> clip('hello world', 0, 5)
        'hello'
        >>> clip('中文字', 0, 3)  # Wide char split at column 3
        '中 '
        >>> clip('a\tb', 0, 10)  # Tab expanded to spaces
        'a       b'
    """
    # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements,too-many-nested-blocks
    # Again, for 'hot path', we avoid additional delegate functions and accept the cost
    # of complexity for improved python performance.

    # Negative start columns are clamped; an empty or inverted range yields ''.
    start = max(start, 0)
    if end <= start:
        return ''

    # Fast path: printable ASCII only (no tabs, escape sequences, or wide or zero-width chars)
    # Here every character is exactly one column, so columns equal string indices.
    if text.isascii() and text.isprintable():
        return text[start:end]

    # Fast path: no escape sequences means no SGR tracking needed
    if propagate_sgr and '\x1b' not in text:
        propagate_sgr = False

    # SGR tracking state (only when propagate_sgr=True)
    sgr_at_clip_start = None  # state when first visible char emitted (None = not yet)
    if propagate_sgr:
        # 'sgr' is only bound, and only read, while propagate_sgr is true.
        sgr = _SGR_STATE_DEFAULT  # current SGR state, updated by all sequences

    output: list[str] = []  # pieces of the result, joined once after the loop
    col = 0  # display column of the next character to be examined
    idx = 0  # index into 'text' of the next character to be examined

    while idx < len(text):
        char = text[idx]

        # Early exit: past visible region, SGR captured, no escape ahead
        # sgr_at_clip_start is only ever assigned while propagate_sgr is true, so
        # without SGR propagation this never triggers and the loop runs to the end.
        if col >= end and sgr_at_clip_start is not None and char != '\x1b':
            break

        # Handle escape sequences
        if char == '\x1b' and (match := ZERO_WIDTH_PATTERN.match(text, idx)):
            seq = match.group()
            if propagate_sgr and _SGR_PATTERN.match(seq):
                # Update SGR state; will be applied as prefix when visible content starts
                # The sequence itself is not copied to 'output' in this branch.
                # NOTE(review): an SGR change that occurs after the first visible
                # cell is therefore not re-emitted in the result, and the trailing
                # reset below is decided from the state at clip start only --
                # confirm this is the intended behavior.
                sgr = _sgr_state_update(sgr, seq)
            else:
                # Non-SGR sequences always preserved
                output.append(seq)
            idx = match.end()
            continue

        # Handle bare ESC (not a valid sequence)
        if char == '\x1b':
            output.append(char)
            idx += 1
            continue

        # TAB expansion
        if char == '\t':
            if tabsize > 0:
                # Advance column by column to the next multiple of tabsize,
                # emitting a space only for the columns inside [start, end).
                next_tab = col + (tabsize - (col % tabsize))
                while col < next_tab:
                    if start <= col < end:
                        output.append(' ')
                        if propagate_sgr and sgr_at_clip_start is None:
                            sgr_at_clip_start = sgr
                    col += 1
            else:
                # Tab expansion disabled: keep the tab, do not advance the column.
                output.append(char)
            idx += 1
            continue

        # Grapheme clustering for everything else
        grapheme = next(iter_graphemes(text, start=idx))
        w = width(grapheme, ambiguous_width=ambiguous_width)

        if w == 0:
            # Zero-width cluster: kept only while the current column is visible.
            if start <= col < end:
                output.append(grapheme)
        elif col >= start and col + w <= end:
            # Fully visible
            output.append(grapheme)
            if propagate_sgr and sgr_at_clip_start is None:
                sgr_at_clip_start = sgr
            col += w
        elif col < end and col + w > start:
            # Partially visible (wide char at boundary)
            # Emit one fillchar per cell of the cluster that lies inside [start, end).
            output.append(fillchar * (min(end, col + w) - max(start, col)))
            if propagate_sgr and sgr_at_clip_start is None:
                sgr_at_clip_start = sgr
            col += w
        else:
            # Entirely outside the clip region: only advance the column.
            col += w

        idx += len(grapheme)

    result = ''.join(output)

    # Apply SGR prefix/suffix
    # Skipped entirely when no visible cell was emitted or SGR propagation is off.
    if sgr_at_clip_start is not None:
        if prefix := _sgr_state_to_sequence(sgr_at_clip_start):
            result = prefix + result
        if _sgr_state_is_active(sgr_at_clip_start):
            result += '\x1b[0m'

    return result