Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/wcwidth.py: 12%

1"""

2This is a python implementation of wcwidth() and wcswidth().

4https://github.com/jquast/wcwidth

6from Markus Kuhn's C code, retrieved from:

8 http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c

This is an implementation of wcwidth() and wcswidth() (defined in
IEEE Std 1003.1-2001) for Unicode.

13http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html

14http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html

16In fixed-width output devices, Latin characters all occupy a single

17"cell" position of equal width, whereas ideographic CJK characters

18occupy two such cells. Interoperability between terminal-line

19applications and (teletype-style) character terminals using the

20UTF-8 encoding requires agreement on which character should advance

21the cursor by how many cell positions. No established formal

22standards exist at present on which Unicode character shall occupy

how many cell positions on character terminals. These routines are
a first attempt at defining such behavior based on simple rules
applied to data provided by the Unicode Consortium.

27For some graphical characters, the Unicode standard explicitly

28defines a character-cell width via the definition of the East Asian

29FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.

30In all these cases, there is no ambiguity about which width a

31terminal shall use. For characters in the East Asian Ambiguous (A)

32class, the width choice depends purely on a preference of backward

33compatibility with either historic CJK or Western practice.

34Choosing single-width for these characters is easy to justify as

35the appropriate long-term solution, as the CJK practice of

36displaying these characters as double-width comes from historic

37implementation simplicity (8-bit encoded characters were displayed

38single-width and 16-bit ones double-width, even for Greek,

39Cyrillic, etc.) and not any typographic considerations.

41Much less clear is the choice of width for the Not East Asian

42(Neutral) class. Existing practice does not dictate a width for any

43of these characters. It would nevertheless make sense

44typographically to allocate two character cells to characters such

45as for instance EM SPACE or VOLUME INTEGRAL, which cannot be

46represented adequately with a single-width glyph. The following

47routines at present merely assign a single-cell width to all

48neutral characters, in the interest of simplicity. This is not

49entirely satisfactory and should be reconsidered before

50establishing a formal standard in this area. At the moment, the

51decision which Not East Asian (Neutral) characters should be

52represented by double-width glyphs cannot yet be answered by

53applying a simple rule from the Unicode database content. Setting

54up a proper standard for the behavior of UTF-8 character terminals

55will require a careful analysis not only of each Unicode character,

but also of each presentation form, something the author of these
routines has so far avoided doing.

59http://www.unicode.org/unicode/reports/tr11/

61Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c

62"""

64from __future__ import annotations

66# std imports

67from functools import lru_cache

69from typing import TYPE_CHECKING

71# local

72from .bisearch import bisearch as _bisearch

73from .grapheme import iter_graphemes

74from .table_mc import CATEGORY_MC

75from .sgr_state import (_SGR_PATTERN,

76 _SGR_STATE_DEFAULT,

77 _sgr_state_update,

78 _sgr_state_is_active,

79 _sgr_state_to_sequence)

80from .table_vs16 import VS16_NARROW_TO_WIDE

81from .table_wide import WIDE_EASTASIAN

82from .table_zero import ZERO_WIDTH

83from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL

84from .table_grapheme import ISC_CONSONANT, EXTENDED_PICTOGRAPHIC, GRAPHEME_REGIONAL_INDICATOR

85from .table_ambiguous import AMBIGUOUS_EASTASIAN

86from .escape_sequences import (ZERO_WIDTH_PATTERN,

87 CURSOR_LEFT_SEQUENCE,

88 CURSOR_RIGHT_SEQUENCE,

89 INDETERMINATE_EFFECT_SEQUENCE)

90from .unicode_versions import list_versions

92if TYPE_CHECKING: # pragma: no cover

93 # std imports

94 from collections.abc import Iterator

96 from typing import Literal

# Only the newest Unicode version ships with the package, so every lookup
# table is resolved once, at import time, instead of on each call.
_LATEST_VERSION = list_versions()[-1]
_ZERO_WIDTH_TABLE = ZERO_WIDTH[_LATEST_VERSION]
_WIDE_EASTASIAN_TABLE = WIDE_EASTASIAN[_LATEST_VERSION]
# Looked up by the mapping's first key rather than by _LATEST_VERSION.
_AMBIGUOUS_TABLE = AMBIGUOUS_EASTASIAN[next(iter(AMBIGUOUS_EASTASIAN))]
_CATEGORY_MC_TABLE = CATEGORY_MC[_LATEST_VERSION]
# Every codepoint of the first (lo, hi) range of GRAPHEME_REGIONAL_INDICATOR,
# bounds inclusive, as a set for O(1) membership tests.
_REGIONAL_INDICATOR_SET = frozenset(range(
    GRAPHEME_REGIONAL_INDICATOR[0][0],
    GRAPHEME_REGIONAL_INDICATOR[0][1] + 1,
))
# Extended_Pictographic codepoints plus the Regional Indicators; used to decide
# whether a Fitzpatrick modifier follows an emoji base.
_EMOJI_ZWJ_SET = _REGIONAL_INDICATOR_SET.union(
    cp for lo, hi in EXTENDED_PICTOGRAPHIC for cp in range(lo, hi + 1)
)
# Inclusive (first, last) codepoints of the Fitzpatrick skin-tone modifiers.
_FITZPATRICK_RANGE = (0x1F3FB, 0x1F3FF)

# Codepoints with Indic_Syllabic_Category=Virama, taken from
# IndicSyllabicCategory.txt. They are structurally bound to their scripts and
# are not expected to change between Unicode versions.
# https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
_ISC_VIRAMA_SET = frozenset({
    0x094D,   # DEVANAGARI SIGN VIRAMA
    0x09CD,   # BENGALI SIGN VIRAMA
    0x0A4D,   # GURMUKHI SIGN VIRAMA
    0x0ACD,   # GUJARATI SIGN VIRAMA
    0x0B4D,   # ORIYA SIGN VIRAMA
    0x0BCD,   # TAMIL SIGN VIRAMA
    0x0C4D,   # TELUGU SIGN VIRAMA
    0x0CCD,   # KANNADA SIGN VIRAMA
    0x0D4D,   # MALAYALAM SIGN VIRAMA
    0x0DCA,   # SINHALA SIGN AL-LAKUNA
    0x1B44,   # BALINESE ADEG ADEG
    0xA806,   # SYLOTI NAGRI SIGN HASANTA
    0xA8C4,   # SAURASHTRA SIGN VIRAMA
    0xA9C0,   # JAVANESE PANGKON
    0x11046,  # BRAHMI VIRAMA
    0x110B9,  # KAITHI SIGN VIRAMA
    0x111C0,  # SHARADA SIGN VIRAMA
    0x11235,  # KHOJKI SIGN VIRAMA
    0x1134D,  # GRANTHA SIGN VIRAMA
    0x11442,  # NEWA SIGN VIRAMA
    0x114C2,  # TIRHUTA SIGN VIRAMA
    0x115BF,  # SIDDHAM SIGN VIRAMA
    0x1163F,  # MODI SIGN VIRAMA
    0x116B6,  # TAKRI SIGN VIRAMA
    0x11839,  # DOGRA SIGN VIRAMA
    0x119E0,  # NANDINAGARI SIGN VIRAMA
    0x11C3F,  # BHAIKSUKI SIGN VIRAMA
})
# Range table searched with _bisearch to recognise a consonant after a virama.
_ISC_CONSONANT_TABLE = ISC_CONSONANT

144

# Length threshold for the 'parse' fast path of width(): only strings longer
# than this are scanned for cursor-movement controls (BS, TAB, CR, cursor
# sequences). When none are found the mode is downgraded to 'ignore', which
# avoids character-by-character parsing. The scan is cheap relative to a long
# string but is wasted effort on short ones such as labels or headings.
_WIDTH_FAST_PATH_MIN_LEN = 20

# str.translate() table that deletes every C0 control (NUL through US, tab
# included), DEL, and every C1 control (U+0080 through U+009F). Used by the
# fast 'ignore' mode.
_CONTROL_CHAR_TABLE = str.maketrans(
    '',
    '',
    ''.join(map(chr, (*range(0x00, 0x20), 0x7f, *range(0x80, 0xa0)))),
)

157

# NOTE: unlike the package-level wcwidth.__all__, this tuple does not define a
# public API, nor what "from wcwidth.wcwidth import *" ought to bring in. The
# imported names are re-exported here only so that type checkers (mypy) accept
# them being imported from this module.
__all__ = (
    # re-exported tables and helpers
    'ZERO_WIDTH', 'WIDE_EASTASIAN', 'AMBIGUOUS_EASTASIAN', 'VS16_NARROW_TO_WIDE',
    'list_versions',
    # measurement
    'wcwidth', 'wcswidth', 'width',
    # sequence-aware text helpers
    'iter_sequences', 'ljust', 'rjust', 'center', 'clip', 'strip_sequences',
    # retained private helpers
    '_wcmatch_version', '_wcversion_value',
)

179

180

181# maxsize=1024: western scripts need ~64 unique codepoints per session, but

182# CJK sessions may use ~2000 of ~3500 common hanzi/kanji. 1024 accommodates

183# heavy CJK use. Performance floor at 32; bisearch is ~100ns per miss.

184

@lru_cache(maxsize=1024)
def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> int:  # pylint: disable=unused-argument
    r"""
    Return the number of terminal cells needed to display one codepoint.

    :param wc: A single Unicode character. An empty string is treated like
        NUL (codepoint 0).
    :param unicode_version: Ignored. Retained for backwards compatibility.

        .. deprecated:: 0.3.0
            Only the latest Unicode version is now shipped.

    :param ambiguous_width: Width assigned to East Asian Ambiguous (A)
        characters. The default ``1`` treats them as narrow; ``2`` treats
        them as wide, as in CJK contexts. See :ref:`ambiguous_width`.
    :returns: ``-1`` for C0 and C1 control characters (NUL '\0' excepted) and
        DEL, ``0`` for characters found in the zero-width table, ``2`` for
        characters found in the wide table (or in the ambiguous table when
        ``ambiguous_width`` is 2), and ``1`` for everything else.

    See :ref:`Specification` for details of cell measurement.
    """
    codepoint = ord(wc) if wc else 0

    # Printable ASCII is by far the most common input, so it is answered
    # before any table search is attempted.
    if 0x20 <= codepoint < 0x7f:
        return 1

    # C0 controls other than NUL, DEL, and the C1 controls are reported as -1,
    # matching POSIX-like wcwidth() calls.
    if 0 < codepoint < 0x20 or 0x7f <= codepoint < 0xa0:
        return -1

    if _bisearch(codepoint, _ZERO_WIDTH_TABLE):
        return 0

    # East Asian Fullwidth (F) and Wide (W) characters.
    if _bisearch(codepoint, _WIDE_EASTASIAN_TABLE):
        return 2

    # East Asian Ambiguous (A) characters are wide only on request.
    if ambiguous_width == 2 and _bisearch(codepoint, _AMBIGUOUS_TABLE):
        return 2

    return 1

234

235

def wcswidth(
    pwcs: str,
    n: int | None = None,
    unicode_version: str = 'auto',
    ambiguous_width: int = 1,
) -> int:
    """
    Given a unicode string, return its printable length on a terminal.

    :param pwcs: Measure width of given unicode string.
    :param n: When ``n`` is None (default), return the length of the entire
        string, otherwise at most the first ``n`` characters are measured.
        ``n`` is an upper bound: values greater than ``len(pwcs)`` measure the
        whole string, and negative values measure nothing (width 0).

        Better to use string slicing capability, ``wcswidth(pwcs[:n])``, instead,
        for performance. This argument is a holdover from the POSIX function for
        matching signatures. Be careful that ``n`` is at grapheme boundaries.

    :param unicode_version: Ignored. Retained for backwards compatibility.

        .. deprecated:: 0.3.0
            Only the latest Unicode version is now shipped.

    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: The width, in cells, needed to display the first ``n`` characters
        of the unicode string ``pwcs``. Returns ``-1`` as soon as a C0 or C1
        control character is measured!

    See :ref:`Specification` for details of cell measurement.
    """
    # pylint: disable=unused-argument,too-many-locals,too-many-statements
    # pylint: disable=too-complex,too-many-branches
    # This function intentionally kept long without delegating functions to reduce function calls in
    # "hot path", the overhead per-character adds up.

    # Clamp ``n`` into [0, len(pwcs)]. Without the clamp a value of ``n`` larger
    # than the string made the loop below index past the end (IndexError).
    text_len = len(pwcs)
    end = text_len if n is None else max(0, min(n, text_len))

    # Fast path: every character of a printable-ASCII string is one cell wide,
    # so the width of its first ``end`` characters is simply ``end``.
    if pwcs.isascii() and pwcs.isprintable():
        return end

    # Select wcwidth call pattern for best lru_cache performance:
    # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls
    # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct)
    _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width)

    total_width = 0
    idx = 0
    last_measured_idx = -2  # Track index of last measured char for VS16
    last_measured_ucs = -1  # Codepoint of last measured char (for deferred emoji check)
    last_was_virama = False  # Virama conjunct formation state
    conjunct_pending = False  # Deferred +1 for bare conjuncts (no trailing Mc)
    while idx < end:
        char = pwcs[idx]
        ucs = ord(char)
        if ucs == 0x200D:
            if last_was_virama:
                # ZWJ after virama requests explicit half-form rendering but
                # does not change cell count -- consume ZWJ only, let the next
                # consonant be handled by the virama conjunct rule.
                idx += 1
            elif idx + 1 < end:
                # Emoji ZWJ: skip next character unconditionally.
                idx += 2
                last_was_virama = False
            else:
                # Trailing ZWJ with nothing after it: zero width.
                idx += 1
                last_was_virama = False
            continue
        if ucs == 0xFE0F and last_measured_idx >= 0:
            # VS16 following a measured character: add 1 if that character is
            # known to be converted from narrow to wide by VS16.
            total_width += _bisearch(ord(pwcs[last_measured_idx]),
                                     VS16_NARROW_TO_WIDE["9.0.0"])
            last_measured_idx = -2  # Prevent double application
            # VS16 preserves emoji context: last_measured_ucs stays as the base
            idx += 1
            continue
        # Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+)
        if ucs > 0xFFFF:
            if ucs in _REGIONAL_INDICATOR_SET:
                # Lazy RI pairing: count preceding consecutive RIs only when one is
                # received, because RIs are received so rarely that it is better than
                # per-loop tracking of 'last char was an RI'.
                ri_before = 0
                j = idx - 1
                while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET:
                    ri_before += 1
                    j -= 1
                if ri_before % 2 == 1:
                    # Second RI in pair: contributes 0 (pair = one 2-cell flag). The
                    # even-or-odd check means 'CAUS' is two flags, while 'CAU' is one
                    # flag followed by a wide 'U'.
                    idx += 1
                    last_measured_ucs = ucs
                    continue
                # First or unpaired RI: measured normally (width 2 from table)
            # Fitzpatrick modifier: zero-width when following emoji base
            elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1]
                  and last_measured_ucs in _EMOJI_ZWJ_SET):
                idx += 1
                continue
        # Virama conjunct formation: consonant following virama contributes 0 width.
        # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category
        if last_was_virama and _bisearch(ucs, _ISC_CONSONANT_TABLE):
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
            conjunct_pending = True
            idx += 1
            continue
        wcw = _wcwidth(char)
        if wcw < 0:
            # early return -1 on C0 and C1 control characters
            return wcw
        if wcw > 0:
            if conjunct_pending:
                # A new base character ends the pending conjunct: pay its cell now.
                total_width += 1
                conjunct_pending = False
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
        elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE):
            # Spacing Combining Mark (Mc) following a base character adds 1
            wcw = 1
            last_measured_idx = -2
            last_was_virama = False
            conjunct_pending = False
        else:
            # Any other zero-width character: remember whether it was a virama.
            last_was_virama = ucs in _ISC_VIRAMA_SET
        total_width += wcw
        idx += 1
    if conjunct_pending:
        # String ended on a bare conjunct: pay the deferred cell.
        total_width += 1
    return total_width

370

371

372# NOTE: _wcversion_value and _wcmatch_version are no longer used internally

373# by wcwidth since version 0.5.0 (only the latest Unicode version is shipped).

374#

375# They are retained for API compatibility with external tools like ucs-detect

376# that may use these private functions.

377

378

@lru_cache(maxsize=128)
def _wcversion_value(ver_string: str) -> tuple[int, ...]:  # pragma: no cover
    """
    Convert a dotted version string into a tuple of integers.

    .. deprecated:: 0.3.0

        Not called anywhere inside wcwidth any more; kept only so external
        tools that import it continue to work.

    :param ver_string: Unicode version string, of form ``n.n.n``.
    :returns: One ``int`` per dot-separated field, e.g. ``'9.0.0'`` becomes
        ``(9, 0, 0)``.
    :raises ValueError: If any field is not a valid integer literal.
    """
    return tuple(int(field) for field in ver_string.split('.'))

394

395

@lru_cache(maxsize=8)
def _wcmatch_version(given_version: str) -> str:  # pylint: disable=unused-argument
    """
    Return the one Unicode version whose tables are loaded by this module.

    .. deprecated:: 0.3.0
        The argument no longer influences the result.

    Not called anywhere inside wcwidth any more; kept only so external tools
    that import it continue to work.

    :param given_version: Ignored. Accepted only for signature compatibility.
    :returns: The module-level ``_LATEST_VERSION`` string.
    """
    return _LATEST_VERSION

411

412

def iter_sequences(text: str) -> Iterator[tuple[str, bool]]:
    r"""
    Split text into alternating runs of plain text and escape sequences.

    Each yielded item is a ``(segment, is_sequence)`` tuple. Plain text between
    ESC characters is yielded whole with ``is_sequence=False``. At every ESC
    (``\x1b``) the text is matched against ``ZERO_WIDTH_PATTERN``: on success the
    matched sequence is yielded, otherwise the ESC character alone is yielded;
    in both cases ``is_sequence`` is ``True``.

    :param text: String to iterate through.
    :returns: Iterator of (segment, is_sequence) tuples.

    .. versionadded:: 0.3.0

    Example::

        >>> list(iter_sequences('hello'))
        [('hello', False)]
        >>> list(iter_sequences('\x1b[31mred'))
        [('\x1b[31m', True), ('red', False)]
        >>> list(iter_sequences('\x1b[1m\x1b[31m'))
        [('\x1b[1m', True), ('\x1b[31m', True)]
    """
    cursor = 0
    total = len(text)

    while cursor < total:
        esc_at = text.find('\x1b', cursor)
        if esc_at < 0:
            # No further ESC: everything left is one plain-text segment.
            yield (text[cursor:], False)
            return

        # Plain text accumulated before this ESC, if any.
        if esc_at > cursor:
            yield (text[cursor:esc_at], False)

        recognized = ZERO_WIDTH_PATTERN.match(text, esc_at)
        if recognized:
            yield (recognized.group(), True)
            cursor = recognized.end()
        else:
            # Lone or unrecognized ESC is still reported as a sequence.
            yield ('\x1b', True)
            cursor = esc_at + 1

463

464

def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int:
    """
    Measure text for width() when ``control_codes='ignore'``.

    Escape sequences are removed with :func:`strip_sequences`, the remaining
    C0/C1 control characters and DEL are deleted with ``_CONTROL_CHAR_TABLE``,
    and what is left is measured by :func:`wcswidth`.

    :param text: String to measure.
    :param ambiguous_width: Forwarded unchanged to :func:`wcswidth`.
    :returns: The value returned by :func:`wcswidth` for the cleaned text.
    """
    without_sequences = strip_sequences(text)
    printable_only = without_sequences.translate(_CONTROL_CHAR_TABLE)
    return wcswidth(printable_only, ambiguous_width=ambiguous_width)

475

476

def width(
    text: str,
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    tabsize: int = 8,
    ambiguous_width: int = 1,
) -> int:
    r"""
    Measure the display width of text that may hold control codes and sequences.

    Where :func:`wcswidth` gives up with -1 on control characters, this function
    interprets or skips them and always returns a non-negative extent.

    :param text: String to measure.
    :param control_codes: Treatment of control characters and sequences:

        - ``'parse'`` (default): horizontal cursor movement from BS ``\b``,
          CR ``\r``, TAB ``\t`` and cursor left/right sequences is tracked.
          Illegal and vertical control characters and all other recognized
          sequences take no cells.
        - ``'strict'``: as ``'parse'``, except that :exc:`ValueError` is raised
          for illegal control characters, vertical movement characters, and
          sequences whose effect is indeterminate. Such input is better handled
          by a virtual terminal emulator (like 'pyte').
        - ``'ignore'``: escape sequences and all C0/C1 control characters are
          removed before measuring, so they count as width 0. TAB ``\t`` is
          zero-width too; for tab expansion pre-process the text, e.g.
          ``text.replace('\t', ' ' * 8)``.

    :param tabsize: Distance between tab stops in ``'parse'`` and ``'strict'``
        modes. Default is 8. A value that is not positive makes TAB advance
        nothing. Unused when ``control_codes='ignore'``.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: The rightmost column the cursor reaches ("extent") while the text
        is processed, starting from column 0.
    :raises ValueError: In ``'strict'`` mode only, as described above.

    .. versionadded:: 0.3.0

    Examples::

        >>> width('hello')
        5
        >>> width('コンニチハ')
        10
        >>> width('\x1b[31mred\x1b[0m')
        3
        >>> width('\x1b[31mred\x1b[0m', control_codes='ignore')  # same result (ignored)
        3
        >>> width('123\b4')  # backspace overwrites previous cell (outputs '124')
        3
        >>> width('abc\t')  # tab caused cursor to move to column 8
        8
        >>> width('1\x1b[10C')  # '1' + cursor right 10, cursor ends on column 11
        11
        >>> width('1\x1b[10C', control_codes='ignore')  # faster but wrong in this case
        1
    """
    # pylint: disable=too-complex,too-many-branches,too-many-statements,too-many-locals
    # Deliberately one long function: this is a likely hot path, so the steps
    # below are inlined rather than split into helpers that would cost a call each.

    # Printable ASCII has no tabs, escapes or controls: one cell per character.
    if text.isascii() and text.isprintable():
        return len(text)

    # For longer strings in 'parse' mode, check whether anything could move the
    # cursor horizontally. If nothing can, 'ignore' mode yields the same answer
    # faster. Short strings skip the check because the scan costs more than it saves.
    if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN:
        has_motion_char = '\b' in text or '\t' in text or '\r' in text
        if not has_motion_char and (
            '\x1b' not in text
            or not (CURSOR_RIGHT_SEQUENCE.search(text) or CURSOR_LEFT_SEQUENCE.search(text))
        ):
            control_codes = 'ignore'

    if control_codes == 'ignore':
        return _width_ignored_codes(text, ambiguous_width)

    strict = control_codes == 'strict'

    # Calling wcwidth with a single argument for the default ambiguous_width lets
    # these calls share lru_cache entries with direct wcwidth(char) calls.
    measure = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width)

    # Columns are absolute: tab stops are computed modulo the column and CR goes
    # back to 0. The extent starts at 0 so moving left can never make it negative.
    col = 0
    extent = 0
    prev_idx = -2  # index of last measured character; -2 can never equal idx - 1
    prev_ucs = -1  # codepoint of last measured character (Fitzpatrick check)
    after_virama = False  # previous zero-width character was a virama
    owe_conjunct_cell = False  # one cell owed for a conjunct not yet closed
    total = len(text)
    idx = 0

    while idx < total:
        char = text[idx]

        # 1. Escape sequences.
        if char == '\x1b':
            found = ZERO_WIDTH_PATTERN.match(text, idx)
            if not found:
                # Lone or unrecognized ESC: no width, step over it.
                idx += 1
            else:
                sequence = found.group()
                if strict and INDETERMINATE_EFFECT_SEQUENCE.match(sequence):
                    raise ValueError(f"Indeterminate cursor sequence at position {idx}")
                move_right = CURSOR_RIGHT_SEQUENCE.match(sequence)
                if move_right:
                    col += int(move_right.group(1) or 1)
                else:
                    move_left = CURSOR_LEFT_SEQUENCE.match(sequence)
                    if move_left:
                        col = max(0, col - int(move_left.group(1) or 1))
                idx = found.end()
            if col > extent:
                extent = col
            continue

        # 2. Illegal and vertical control characters: zero width, error if strict.
        if char in ILLEGAL_CTRL:
            if strict:
                raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}")
            idx += 1
            continue

        if char in VERTICAL_CTRL:
            if strict:
                raise ValueError(f"Vertical movement character {ord(char):#x} at position {idx}")
            idx += 1
            continue

        # 3. Horizontal movement characters.
        if char in HORIZONTAL_CTRL:
            if char == '\x09' and tabsize > 0:  # TAB: advance to the next tab stop
                col += tabsize - (col % tabsize)
            elif char == '\x08':  # BS: one cell left, never past column 0
                if col > 0:
                    col -= 1
            elif char == '\x0d':  # CR: back to the first column
                col = 0
            if col > extent:
                extent = col
            idx += 1
            continue

        # 4. ZWJ. Directly after a virama only the joiner itself is consumed, so
        # the following consonant still reaches the conjunct rule (step 7).
        # Otherwise it is an emoji joiner and the next character is skipped with it.
        if char == '\u200D':
            skip_next = not after_virama and idx + 1 < total
            idx += 2 if skip_next else 1
            continue

        # 5. Remaining zero-width control characters.
        if char in ZERO_WIDTH_CTRL:
            idx += 1
            continue

        ucs = ord(char)

        # 6. VS16 widens the character immediately before it when that character
        # is listed as narrow-to-wide; VS16 itself never takes a cell. prev_ucs is
        # left untouched so the emoji base is still remembered.
        if ucs == 0xFE0F:
            if prev_idx == idx - 1 and _bisearch(ord(text[prev_idx]), VS16_NARROW_TO_WIDE["9.0.0"]):
                col += 1
                if col > extent:
                    extent = col
            idx += 1
            continue

        # 6b/6c. Regional Indicators and Fitzpatrick modifiers, all above the BMP.
        if ucs > 0xFFFF:
            if ucs in _REGIONAL_INDICATOR_SET:
                # Count the unbroken run of Regional Indicators before this one.
                # An odd count means this one completes a pair and adds no width.
                preceding_ri = 0
                back = idx - 1
                while back >= 0 and ord(text[back]) in _REGIONAL_INDICATOR_SET:
                    preceding_ri += 1
                    back -= 1
                if preceding_ri % 2 == 1:
                    prev_ucs = ucs
                    idx += 1
                    continue
            elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1]
                  and prev_ucs in _EMOJI_ZWJ_SET):
                # Skin-tone modifier attached to an emoji base: zero width.
                idx += 1
                continue

        # 7. Virama conjunct: a consonant right after a virama adds no width now;
        # one cell is owed and settled later.
        # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category
        if after_virama and _bisearch(ucs, _ISC_CONSONANT_TABLE):
            prev_idx = idx
            prev_ucs = ucs
            after_virama = False
            owe_conjunct_cell = True
            idx += 1
            continue

        # 8. Everything else is measured by wcwidth.
        cells = measure(char)
        if cells > 0:
            if owe_conjunct_cell:
                col += 1
                owe_conjunct_cell = False
            col += cells
            if col > extent:
                extent = col
            prev_idx = idx
            prev_ucs = ucs
            after_virama = False
        elif prev_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE):
            # A Spacing Combining Mark (Mc) after a base character takes one cell.
            col += 1
            if col > extent:
                extent = col
            prev_idx = -2
            after_virama = False
            owe_conjunct_cell = False
        else:
            after_virama = ucs in _ISC_VIRAMA_SET
        idx += 1

    # Text ended while a conjunct cell was still owed.
    if owe_conjunct_cell:
        col += 1
        if col > extent:
            extent = col
    return extent

717

718

def ljust(
    text: str,
    dest_width: int,
    fillchar: str = ' ',
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    ambiguous_width: int = 1,
) -> str:
    r"""
    Pad text on the right until it occupies ``dest_width`` terminal cells.

    :param text: String to justify, may contain terminal sequences.
    :param dest_width: Total display width of result in terminal cells.
    :param fillchar: Padding character (default space). It is repeated once per
        missing cell, so it should display as exactly one cell (not wide, not
        zero-width, not combining); ``'·'`` is fine. The width is not validated.
    :param control_codes: Passed to :func:`width` when the text is measured.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: ``text`` followed by padding. Text already at least
        ``dest_width`` cells wide is returned without padding.

    .. versionadded:: 0.3.0

    Example::

        >>> wcwidth.ljust('hi', 5)
        'hi   '
        >>> wcwidth.ljust('\x1b[31mhi\x1b[0m', 5)
        '\x1b[31mhi\x1b[0m   '
        >>> wcwidth.ljust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6)
        '👨‍👩‍👧    '
    """
    # Printable ASCII is measured by length alone, without calling width().
    is_plain = text.isascii() and text.isprintable()
    measured = len(text) if is_plain else width(
        text, control_codes=control_codes, ambiguous_width=ambiguous_width)
    shortfall = dest_width - measured
    pad_count = shortfall if shortfall > 0 else 0
    return text + fillchar * pad_count

758

759

def rjust(
    text: str,
    dest_width: int,
    fillchar: str = ' ',
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    ambiguous_width: int = 1,
) -> str:
    r"""
    Pad text on the left until it occupies ``dest_width`` terminal cells.

    :param text: String to justify, may contain terminal sequences.
    :param dest_width: Total display width of result in terminal cells.
    :param fillchar: Padding character (default space). It is repeated once per
        missing cell, so it should display as exactly one cell (not wide, not
        zero-width, not combining); ``'·'`` is fine. The width is not validated.
    :param control_codes: Passed to :func:`width` when the text is measured.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: Padding followed by ``text``. Text already at least
        ``dest_width`` cells wide is returned without padding.

    .. versionadded:: 0.3.0

    Example::

        >>> wcwidth.rjust('hi', 5)
        '   hi'
        >>> wcwidth.rjust('\x1b[31mhi\x1b[0m', 5)
        '   \x1b[31mhi\x1b[0m'
        >>> wcwidth.rjust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6)
        '    👨‍👩‍👧'
    """
    # Printable ASCII is measured by length alone, without calling width().
    is_plain = text.isascii() and text.isprintable()
    measured = len(text) if is_plain else width(
        text, control_codes=control_codes, ambiguous_width=ambiguous_width)
    shortfall = dest_width - measured
    pad_count = shortfall if shortfall > 0 else 0
    return fillchar * pad_count + text

799

800

def center(
    text: str,
    dest_width: int,
    fillchar: str = ' ',
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    ambiguous_width: int = 1,
) -> str:
    r"""
    Pad text on both sides until it occupies ``dest_width`` terminal cells.

    :param text: String to center, may contain terminal sequences.
    :param dest_width: Total display width of result in terminal cells.
    :param fillchar: Padding character (default space). It is repeated once per
        missing cell, so it should display as exactly one cell (not wide, not
        zero-width, not combining); ``'·'`` is fine. The width is not validated.
    :param control_codes: Passed to :func:`width` when the text is measured.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: ``text`` with padding on both sides. Text already at least
        ``dest_width`` cells wide is returned without padding.

    When the number of padding cells is odd they cannot be split evenly. The
    extra cell is placed on the left when ``dest_width`` is odd as well, and on
    the right otherwise, which is the same rule as :meth:`str.center`.

    .. versionadded:: 0.3.0

    Example::

        >>> wcwidth.center('hi', 6)
        '  hi  '
        >>> wcwidth.center('\x1b[31mhi\x1b[0m', 6)
        '  \x1b[31mhi\x1b[0m  '
        >>> wcwidth.center('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6)
        '  👨‍👩‍👧  '
    """
    # Printable ASCII is measured by length alone, without calling width().
    is_plain = text.isascii() and text.isprintable()
    measured = len(text) if is_plain else width(
        text, control_codes=control_codes, ambiguous_width=ambiguous_width)
    shortfall = dest_width - measured
    pad_total = shortfall if shortfall > 0 else 0
    # Same split as str.center, see
    # https://jazcap53.github.io/pythons-eccentric-strcenter.html
    pad_left = pad_total // 2 + (pad_total & dest_width & 1)
    pad_right = pad_total - pad_left
    return fillchar * pad_left + text + fillchar * pad_right

846

847

def strip_sequences(text: str) -> str:
    r"""
    Remove terminal escape sequences from ``text``.

    Every substring matched by ``ZERO_WIDTH_PATTERN`` is deleted; all other
    characters are returned unchanged and in their original order.

    Unknown or incomplete ESC sequences are preserved.

    :param text: String that may contain terminal escape sequences.
    :returns: The input text with all escape sequences stripped.

    .. versionadded:: 0.3.0

    Example::

        >>> strip_sequences('\x1b[31mred\x1b[0m')
        'red'
        >>> strip_sequences('hello')
        'hello'
        >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text')
        'bold red text'
    """
    without_sequences = ZERO_WIDTH_PATTERN.sub('', text)
    return without_sequences

869

870

871def clip(

872 text: str,

873 start: int,

874 end: int,

875 *,

876 fillchar: str = ' ',

877 tabsize: int = 8,

878 ambiguous_width: int = 1,

879 propagate_sgr: bool = True,

880) -> str:

881 r"""

882 Clip text to display columns ``(start, end)`` while preserving all terminal sequences.

883

884 This function extracts a substring based on visible column positions rather than

885 character indices. Terminal escape sequences are preserved in the output since

886 they have zero display width. If a wide character (width 2) would be split at

887 either boundary, it is replaced with ``fillchar``.

888

889 TAB characters (``\t``) are expanded to spaces up to the next tab stop,

890 controlled by the ``tabsize`` parameter.

891

892 Other cursor movement characters (backspace, carriage return) and cursor

893 movement sequences are passed through unchanged as zero-width.

894

895 :param text: String to clip, may contain terminal escape sequences.

896 :param start: Absolute starting column (inclusive, 0-indexed).

897 :param end: Absolute ending column (exclusive).

898 :param fillchar: Character to use when a wide character must be split at

899 a boundary (default space). Must have display width of 1.

900 :param tabsize: Tab stop width (default 8). Set to 0 to pass tabs through

901 as zero-width (preserved in output but don't advance column position).

902 :param ambiguous_width: Width to use for East Asian Ambiguous (A)

903 characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.

904 :param propagate_sgr: If True (default), SGR (terminal styling) sequences

905 are propagated. The result begins with any active style at the start

906 position and ends with a reset sequence if styles are active.

907 :returns: Substring of ``text`` spanning display columns ``(start, end)``,

908 with all terminal sequences preserved and wide characters at boundaries

909 replaced with ``fillchar``.

910

911 SGR (terminal styling) sequences are propagated by default. The result

912 begins with any active style and ends with a reset::

913

914 >>> clip('\x1b[1;34mHello world\x1b[0m', 6, 11)

915 '\x1b[1;34mworld\x1b[0m'

916

917 Set ``propagate_sgr=False`` to disable this behavior.

918

919 .. versionadded:: 0.3.0

920

921 .. versionchanged:: 0.5.0

922 Added ``propagate_sgr`` parameter (default True).

923

924 Example::

925

926 >>> clip('hello world', 0, 5)

927 'hello'

928 >>> clip('中文字', 0, 3) # Wide char split at column 3

929 '中 '

930 >>> clip('a\tb', 0, 10) # Tab expanded to spaces

931 'a b'

932 """

933 # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements,too-many-nested-blocks

934 # Again, for 'hot path', we avoid additional delegate functions and accept the cost

935 # of complexity for improved python performance.

936 start = max(start, 0)

937 if end <= start:

938 return ''

939

940 # Fast path: printable ASCII only (no tabs, escape sequences, or wide or zero-width chars)

941 if text.isascii() and text.isprintable():

942 return text[start:end]

943

944 # Fast path: no escape sequences means no SGR tracking needed

945 if propagate_sgr and '\x1b' not in text:

946 propagate_sgr = False

947

948 # SGR tracking state (only when propagate_sgr=True)

949 sgr_at_clip_start = None # state when first visible char emitted (None = not yet)

950 if propagate_sgr:

951 sgr = _SGR_STATE_DEFAULT # current SGR state, updated by all sequences

952

953 output: list[str] = []

954 col = 0

955 idx = 0

956

957 while idx < len(text):

958 char = text[idx]

959

960 # Early exit: past visible region, SGR captured, no escape ahead

961 if col >= end and sgr_at_clip_start is not None and char != '\x1b':

962 break

963

964 # Handle escape sequences

965 if char == '\x1b' and (match := ZERO_WIDTH_PATTERN.match(text, idx)):

966 seq = match.group()

967 if propagate_sgr and _SGR_PATTERN.match(seq):

968 # Update SGR state; will be applied as prefix when visible content starts

969 sgr = _sgr_state_update(sgr, seq)

970 else:

971 # Non-SGR sequences always preserved

972 output.append(seq)

973 idx = match.end()

974 continue

975

976 # Handle bare ESC (not a valid sequence)

977 if char == '\x1b':

978 output.append(char)

979 idx += 1

980 continue

981

982 # TAB expansion

983 if char == '\t':

984 if tabsize > 0:

985 next_tab = col + (tabsize - (col % tabsize))

986 while col < next_tab:

987 if start <= col < end:

988 output.append(' ')

989 if propagate_sgr and sgr_at_clip_start is None:

990 sgr_at_clip_start = sgr

991 col += 1

992 else:

993 output.append(char)

994 idx += 1

995 continue

996

997 # Grapheme clustering for everything else

998 grapheme = next(iter_graphemes(text, start=idx))

999 w = width(grapheme, ambiguous_width=ambiguous_width)

1000

1001 if w == 0:

1002 if start <= col < end:

1003 output.append(grapheme)

1004 elif col >= start and col + w <= end:

1005 # Fully visible

1006 output.append(grapheme)

1007 if propagate_sgr and sgr_at_clip_start is None:

1008 sgr_at_clip_start = sgr

1009 col += w

1010 elif col < end and col + w > start:

1011 # Partially visible (wide char at boundary)

1012 output.append(fillchar * (min(end, col + w) - max(start, col)))

1013 if propagate_sgr and sgr_at_clip_start is None:

1014 sgr_at_clip_start = sgr

1015 col += w

1016 else:

1017 col += w

1018

1019 idx += len(grapheme)

1020

1021 result = ''.join(output)

1022

1023 # Apply SGR prefix/suffix

1024 if sgr_at_clip_start is not None:

1025 if prefix := _sgr_state_to_sequence(sgr_at_clip_start):

1026 result = prefix + result

1027 if _sgr_state_is_active(sgr_at_clip_start):

1028 result += '\x1b[0m'

1029

1030 return result