Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/wcwidth.py: 12%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

346 statements  

1""" 

2This is a python implementation of wcwidth() and wcswidth(). 

3 

4https://github.com/jquast/wcwidth 

5 

6from Markus Kuhn's C code, retrieved from: 

7 

8 http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c 

9 

10This is an implementation of wcwidth() and wcswidth() (defined in 

11IEEE Std 1003.1-2001) for Unicode. 

12 

13http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html 

14http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html 

15 

16In fixed-width output devices, Latin characters all occupy a single 

17"cell" position of equal width, whereas ideographic CJK characters 

18occupy two such cells. Interoperability between terminal-line 

19applications and (teletype-style) character terminals using the 

20UTF-8 encoding requires agreement on which character should advance 

21the cursor by how many cell positions. No established formal 

22standards exist at present on which Unicode character shall occupy 

23how many cell positions on character terminals. These routines are 

24a first attempt of defining such behavior based on simple rules 

25applied to data provided by the Unicode Consortium. 

26 

27For some graphical characters, the Unicode standard explicitly 

28defines a character-cell width via the definition of the East Asian 

29FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes. 

30In all these cases, there is no ambiguity about which width a 

31terminal shall use. For characters in the East Asian Ambiguous (A) 

32class, the width choice depends purely on a preference of backward 

33compatibility with either historic CJK or Western practice. 

34Choosing single-width for these characters is easy to justify as 

35the appropriate long-term solution, as the CJK practice of 

36displaying these characters as double-width comes from historic 

37implementation simplicity (8-bit encoded characters were displayed 

38single-width and 16-bit ones double-width, even for Greek, 

39Cyrillic, etc.) and not any typographic considerations. 

40 

41Much less clear is the choice of width for the Not East Asian 

42(Neutral) class. Existing practice does not dictate a width for any 

43of these characters. It would nevertheless make sense 

44typographically to allocate two character cells to characters such 

45as for instance EM SPACE or VOLUME INTEGRAL, which cannot be 

46represented adequately with a single-width glyph. The following 

47routines at present merely assign a single-cell width to all 

48neutral characters, in the interest of simplicity. This is not 

49entirely satisfactory and should be reconsidered before 

50establishing a formal standard in this area. At the moment, the 

51decision which Not East Asian (Neutral) characters should be 

52represented by double-width glyphs cannot yet be answered by 

53applying a simple rule from the Unicode database content. Setting 

54up a proper standard for the behavior of UTF-8 character terminals 

55will require a careful analysis not only of each Unicode character, 

56but also of each presentation form, something the author of these 

57routines has so far avoided doing. 

58 

59http://www.unicode.org/unicode/reports/tr11/ 

60 

61Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c 

62""" 

63 

64from __future__ import annotations 

65 

66# std imports 

67from functools import lru_cache 

68 

69from typing import TYPE_CHECKING 

70 

71# local 

72from .bisearch import bisearch as _bisearch 

73from .grapheme import iter_graphemes 

74from .table_mc import CATEGORY_MC 

75from .sgr_state import (_SGR_PATTERN, 

76 _SGR_STATE_DEFAULT, 

77 _sgr_state_update, 

78 _sgr_state_is_active, 

79 _sgr_state_to_sequence) 

80from .table_vs16 import VS16_NARROW_TO_WIDE 

81from .table_wide import WIDE_EASTASIAN 

82from .table_zero import ZERO_WIDTH 

83from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL 

84from .table_grapheme import ISC_CONSONANT, EXTENDED_PICTOGRAPHIC, GRAPHEME_REGIONAL_INDICATOR 

85from .table_ambiguous import AMBIGUOUS_EASTASIAN 

86from .escape_sequences import (ZERO_WIDTH_PATTERN, 

87 CURSOR_LEFT_SEQUENCE, 

88 CURSOR_RIGHT_SEQUENCE, 

89 INDETERMINATE_EFFECT_SEQUENCE) 

90from .unicode_versions import list_versions 

91 

92if TYPE_CHECKING: # pragma: no cover 

93 # std imports 

94 from collections.abc import Iterator 

95 

96 from typing import Literal 

97 

# Table references are resolved once at import time so the per-character
# lookups below never pay for a dictionary access by version string.
_LATEST_VERSION = list_versions()[-1]
_ZERO_WIDTH_TABLE = ZERO_WIDTH[_LATEST_VERSION]
_WIDE_EASTASIAN_TABLE = WIDE_EASTASIAN[_LATEST_VERSION]
# The ambiguous table is looked up by its first key rather than by
# _LATEST_VERSION.
_AMBIGUOUS_TABLE = AMBIGUOUS_EASTASIAN[next(iter(AMBIGUOUS_EASTASIAN))]
_CATEGORY_MC_TABLE = CATEGORY_MC[_LATEST_VERSION]
# Every codepoint of the first GRAPHEME_REGIONAL_INDICATOR range, inclusive.
_REGIONAL_INDICATOR_SET = frozenset(range(
    GRAPHEME_REGIONAL_INDICATOR[0][0],
    GRAPHEME_REGIONAL_INDICATOR[0][1] + 1,
))
# Codepoints treated as an emoji base: every Extended_Pictographic codepoint
# plus the Regional Indicators.
_EMOJI_ZWJ_SET = _REGIONAL_INDICATOR_SET | frozenset(
    codepoint
    for first, last in EXTENDED_PICTOGRAPHIC
    for codepoint in range(first, last + 1)
)
# Inclusive (low, high) bounds of the Fitzpatrick modifier check.
_FITZPATRICK_RANGE = (0x1F3FB, 0x1F3FF)

# Codepoints with Indic_Syllabic_Category=Virama, taken from
# IndicSyllabicCategory.txt. They are structurally tied to their scripts and
# are not expected to change.
# https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
_ISC_VIRAMA_SET = frozenset({
    # Basic Multilingual Plane
    0x094D,   # DEVANAGARI SIGN VIRAMA
    0x09CD,   # BENGALI SIGN VIRAMA
    0x0A4D,   # GURMUKHI SIGN VIRAMA
    0x0ACD,   # GUJARATI SIGN VIRAMA
    0x0B4D,   # ORIYA SIGN VIRAMA
    0x0BCD,   # TAMIL SIGN VIRAMA
    0x0C4D,   # TELUGU SIGN VIRAMA
    0x0CCD,   # KANNADA SIGN VIRAMA
    0x0D4D,   # MALAYALAM SIGN VIRAMA
    0x0DCA,   # SINHALA SIGN AL-LAKUNA
    0x1B44,   # BALINESE ADEG ADEG
    0xA806,   # SYLOTI NAGRI SIGN HASANTA
    0xA8C4,   # SAURASHTRA SIGN VIRAMA
    0xA9C0,   # JAVANESE PANGKON
    # Supplementary planes
    0x11046,  # BRAHMI VIRAMA
    0x110B9,  # KAITHI SIGN VIRAMA
    0x111C0,  # SHARADA SIGN VIRAMA
    0x11235,  # KHOJKI SIGN VIRAMA
    0x1134D,  # GRANTHA SIGN VIRAMA
    0x11442,  # NEWA SIGN VIRAMA
    0x114C2,  # TIRHUTA SIGN VIRAMA
    0x115BF,  # SIDDHAM SIGN VIRAMA
    0x1163F,  # MODI SIGN VIRAMA
    0x116B6,  # TAKRI SIGN VIRAMA
    0x11839,  # DOGRA SIGN VIRAMA
    0x119E0,  # NANDINAGARI SIGN VIRAMA
    0x11C3F,  # BHAIKSUKI SIGN VIRAMA
})
# Alias of the imported consonant range table, searched with _bisearch().
_ISC_CONSONANT_TABLE = ISC_CONSONANT

144 

# Length threshold for the 'parse' fast path of width(): only strings longer
# than this are scanned for cursor-movement controls (BS, TAB, CR, cursor
# sequences), and when none are found the mode is downgraded to 'ignore' so
# character-by-character parsing is skipped. The scan is negligible for long
# strings but wasted effort on short ones such as labels or headings.
_WIDTH_FAST_PATH_MIN_LEN = 20

# Deletion table for str.translate(): removes every C0 control (NUL through
# US, which includes TAB), DEL, and every C1 control (U+0080-U+009F). Used by
# the fast 'ignore' mode.
_CONTROL_CHAR_TABLE = str.maketrans('', '', ''.join(
    map(chr, (*range(0x00, 0x20), 0x7f, *range(0x80, 0xa0)))
))

157 

# Unlike wcwidth.__all__, this wcwidth.wcwidth.__all__ does NOT define a public
# API, nor what should be imported by "from wcwidth.wcwidth import *". The
# imported names are re-exported here only to satisfy type checkers (mypy).
__all__ = (
    # re-exported imports
    'ZERO_WIDTH',
    'WIDE_EASTASIAN',
    'AMBIGUOUS_EASTASIAN',
    'VS16_NARROW_TO_WIDE',
    'list_versions',
    # measurement
    'wcwidth',
    'wcswidth',
    'width',
    # sequence iteration, justification, and clipping
    'iter_sequences',
    'ljust',
    'rjust',
    'center',
    'clip',
    'strip_sequences',
    # private helpers retained for external tools
    '_wcmatch_version',
    '_wcversion_value',
)

179 

180 

181# maxsize=1024: western scripts need ~64 unique codepoints per session, but 

182# CJK sessions may use ~2000 of ~3500 common hanzi/kanji. 1024 accommodates 

183# heavy CJK use. Performance floor at 32; bisearch is ~100ns per miss. 

184 

@lru_cache(maxsize=1024)
def wcwidth(wc: str, unicode_version: str = 'auto', ambiguous_width: int = 1) -> int:  # pylint: disable=unused-argument
    r"""
    Return the number of terminal cells occupied by a single codepoint.

    :param wc: A single Unicode character. An empty string is measured as
        codepoint 0 (NUL).
    :param unicode_version: Ignored. Retained for backwards compatibility.

        .. deprecated:: 0.3.0
            Only the latest Unicode version is now shipped.

    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts
        where ambiguous characters display as double-width. See
        :ref:`ambiguous_width` for details.
    :returns: ``-1`` for C0 controls other than NUL, for DEL, and for C1
        controls; ``0`` for codepoints found in the zero-width table (such as
        NUL '\0'); ``2`` for codepoints in the wide table, or in the ambiguous
        table when ``ambiguous_width`` is ``2``; otherwise ``1``.

    See :ref:`Specification` for details of cell measurement.
    """
    codepoint = ord(wc) if wc else 0

    # Printable ASCII is answered before any table search. Upstream measured
    # roughly a 40% gain on mostly-ASCII documents for under 1% cost elsewhere.
    if 0x20 <= codepoint <= 0x7e:
        return 1

    # C0 (NUL excluded), DEL and C1 controls report -1, for compatibility
    # with POSIX-like calls.
    if 0 < codepoint < 0x20 or 0x7f <= codepoint <= 0x9f:
        return -1

    if _bisearch(codepoint, _ZERO_WIDTH_TABLE):
        return 0

    # Wide (F/W) always measures 2; Ambiguous (A) only when the caller asked
    # for ambiguous_width=2.
    if _bisearch(codepoint, _WIDE_EASTASIAN_TABLE) or (
        ambiguous_width == 2 and _bisearch(codepoint, _AMBIGUOUS_TABLE)
    ):
        return 2

    return 1

234 

235 

def wcswidth(
    pwcs: str,
    n: int | None = None,
    unicode_version: str = 'auto',
    ambiguous_width: int = 1,
) -> int:
    """
    Given a unicode string, return its printable length on a terminal.

    :param pwcs: Measure width of given unicode string.
    :param n: When ``n`` is None (default), return the length of the entire
        string, otherwise only the first ``n`` characters are measured. A value
        larger than ``len(pwcs)`` measures the whole string; zero or a negative
        value measures nothing and returns ``0``.

        Better to use string slicing capability, ``wcswidth(pwcs[:n])``, instead,
        for performance. This argument is a holdover from the POSIX function for
        matching signatures. Be careful that ``n`` is at grapheme boundaries.

    :param unicode_version: Ignored. Retained for backwards compatibility.

        .. deprecated:: 0.3.0
            Only the latest Unicode version is now shipped.

    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: The width, in cells, needed to display the first ``n`` characters
        of the unicode string ``pwcs``. Returns ``-1`` as soon as a measured
        character is a C0 or C1 control character!

    See :ref:`Specification` for details of cell measurement.
    """
    # pylint: disable=unused-argument,too-many-locals,too-many-statements
    # pylint: disable=too-complex,too-many-branches
    # This function intentionally kept long without delegating functions to reduce function calls in
    # "hot path", the overhead per-character adds up.

    # Clamp the measured span to the string: an unclamped ``n`` larger than the
    # string would index past its end in the loop below and raise IndexError.
    end = len(pwcs) if n is None else min(n, len(pwcs))

    # Fast path: in a pure ASCII printable string every character is exactly
    # one cell, so the width is the number of characters measured.
    if pwcs.isascii() and pwcs.isprintable():
        return max(end, 0)

    # Select wcwidth call pattern for best lru_cache performance:
    # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls
    # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct)
    _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width)

    total_width = 0
    idx = 0
    last_measured_idx = -2  # Track index of last measured char for VS16
    last_measured_ucs = -1  # Codepoint of last measured char (for deferred emoji check)
    last_was_virama = False  # Virama conjunct formation state
    conjunct_pending = False  # Deferred +1 for bare conjuncts (no trailing Mc)
    while idx < end:
        char = pwcs[idx]
        ucs = ord(char)
        if ucs == 0x200D:
            if last_was_virama:
                # ZWJ after virama requests explicit half-form rendering but
                # does not change cell count — consume ZWJ only, let the next
                # consonant be handled by the virama conjunct rule.
                idx += 1
            elif idx + 1 < end:
                # Emoji ZWJ: skip next character unconditionally.
                idx += 2
                last_was_virama = False
            else:
                idx += 1
                last_was_virama = False
            continue
        if ucs == 0xFE0F and last_measured_idx >= 0:
            # VS16 following a measured character: add 1 if that character is
            # known to be converted from narrow to wide by VS16.
            total_width += _bisearch(ord(pwcs[last_measured_idx]),
                                     VS16_NARROW_TO_WIDE["9.0.0"])
            last_measured_idx = -2  # Prevent double application
            # VS16 preserves emoji context: last_measured_ucs stays as the base
            idx += 1
            continue
        # Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+)
        if ucs > 0xFFFF:
            if ucs in _REGIONAL_INDICATOR_SET:
                # Lazy RI pairing: count preceding consecutive RIs only when the last one is
                # received, because RI's are received so rarely its better than per-loop tracking of
                # 'last char was an RI'.
                ri_before = 0
                j = idx - 1
                while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET:
                    ri_before += 1
                    j -= 1
                if ri_before % 2 == 1:
                    # Second RI in pair: contributes 0 (pair = one 2-cell flag) using an even-or-odd
                    # check to determine, 'CAUS' would be two flags, but 'CAU' would be 1 flag
                    # and wide 'U'.
                    idx += 1
                    last_measured_ucs = ucs
                    continue
                # First or unpaired RI: measured normally (width 2 from table)
            # Fitzpatrick modifier: zero-width when following emoji base
            elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1]
                  and last_measured_ucs in _EMOJI_ZWJ_SET):
                idx += 1
                continue
        # Virama conjunct formation: consonant following virama contributes 0 width.
        # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category
        if last_was_virama and _bisearch(ucs, _ISC_CONSONANT_TABLE):
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
            conjunct_pending = True
            idx += 1
            continue
        wcw = _wcwidth(char)
        if wcw < 0:
            # early return -1 on C0 and C1 control characters
            return wcw
        if wcw > 0:
            if conjunct_pending:
                total_width += 1
                conjunct_pending = False
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
        elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE):
            # Spacing Combining Mark (Mc) following a base character adds 1
            wcw = 1
            last_measured_idx = -2
            last_was_virama = False
            conjunct_pending = False
        else:
            last_was_virama = ucs in _ISC_VIRAMA_SET
        total_width += wcw
        idx += 1
    # A conjunct that was never followed by a measured character or Mc still
    # occupies its deferred cell.
    if conjunct_pending:
        total_width += 1
    return total_width

370 

371 

372# NOTE: _wcversion_value and _wcmatch_version are no longer used internally 

373# by wcwidth since version 0.5.0 (only the latest Unicode version is shipped). 

374# 

375# They are retained for API compatibility with external tools like ucs-detect 

376# that may use these private functions. 

377 

378 

@lru_cache(maxsize=128)
def _wcversion_value(ver_string: str) -> tuple[int, ...]:  # pragma: no cover
    """
    Convert a dotted version string into a tuple of integers.

    .. deprecated:: 0.3.0

        This function is no longer used internally by wcwidth but is retained
        for API compatibility with external tools.

    :param ver_string: Unicode version string, of form ``n.n.n``.
    :returns: One ``int`` per dot-separated component, for example
        ``(9, 0, 0)`` for ``'9.0.0'``.
    """
    return tuple(int(component) for component in ver_string.split('.'))

394 

395 

@lru_cache(maxsize=8)
def _wcmatch_version(given_version: str) -> str:  # pylint: disable=unused-argument
    """
    Return the Unicode version whose tables are in use.

    The argument takes no part in the result: the module-level latest version
    is returned for every input.

    .. deprecated:: 0.3.0
        This function now always returns the latest version.

        This function is no longer used internally by wcwidth but is retained
        for API compatibility with external tools.

    :param given_version: Ignored. Any value is accepted for compatibility.
    :returns: The latest unicode version string.
    """
    return _LATEST_VERSION

411 

412 

def iter_sequences(text: str) -> Iterator[tuple[str, bool]]:
    r"""
    Split text into alternating plain-text and escape-sequence segments.

    Each yielded item is a ``(segment, is_sequence)`` tuple. ``is_sequence`` is
    ``True`` for a terminal escape sequence matched at an ESC character, and
    also for an ESC that starts no recognized sequence, which is yielded on
    its own. Everything between such segments is yielded with
    ``is_sequence`` of ``False``.

    :param text: String to iterate through.
    :returns: Iterator of (segment, is_sequence) tuples.

    .. versionadded:: 0.3.0

    Example::

        >>> list(iter_sequences('hello'))
        [('hello', False)]
        >>> list(iter_sequences('\x1b[31mred'))
        [('\x1b[31m', True), ('red', False)]
        >>> list(iter_sequences('\x1b[1m\x1b[31m'))
        [('\x1b[1m', True), ('\x1b[31m', True)]
    """
    cursor = 0
    size = len(text)

    while cursor < size:
        esc_at = text.find('\x1b', cursor)
        if esc_at < 0:
            # No further ESC: whatever is left is plain text.
            yield (text[cursor:], False)
            return

        # Plain text accumulated ahead of the ESC, if any.
        if esc_at > cursor:
            yield (text[cursor:esc_at], False)

        found = ZERO_WIDTH_PATTERN.match(text, esc_at)
        if found:
            yield (found.group(), True)
            cursor = found.end()
        else:
            # Lone ESC or unrecognized - yield as sequence anyway
            yield ('\x1b', True)
            cursor = esc_at + 1

463 

464 

def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int:
    """
    Measure text for width() when control_codes='ignore'.

    Escape sequences are removed with strip_sequences(), the remaining C0, DEL
    and C1 control characters are deleted through ``_CONTROL_CHAR_TABLE``, and
    what is left is measured by wcswidth().

    :param text: String to measure.
    :param ambiguous_width: Passed through to :func:`wcswidth`.
    :returns: The result of :func:`wcswidth` on the filtered text.
    """
    printable = strip_sequences(text).translate(_CONTROL_CHAR_TABLE)
    return wcswidth(printable, ambiguous_width=ambiguous_width)

475 

476 

def width(
    text: str,
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    tabsize: int = 8,
    ambiguous_width: int = 1,
) -> int:
    r"""
    Return printable width of text containing many kinds of control codes and sequences.

    Unlike :func:`wcswidth`, this function handles most control characters and many popular terminal
    output sequences. Never returns -1.

    :param text: String to measure.
    :param control_codes: How to handle control characters and sequences:

        - ``'parse'`` (default): Track horizontal cursor movement from BS ``\b``, CR ``\r``, TAB
          ``\t``, and cursor left and right movement sequences. Vertical movement (LF, VT, FF) and
          indeterminate sequences are zero-width. Never raises.
        - ``'strict'``: Like parse, but raises :exc:`ValueError` on control characters with
          indeterminate results of the screen or cursor, like clear or vertical movement. Generally,
          these should be handled with a virtual terminal emulator (like 'pyte').
        - ``'ignore'``: All C0 and C1 control characters and escape sequences are measured as
          width 0. This is the fastest measurement for text already filtered or known not to contain
          any kinds of control codes or sequences. TAB ``\t`` is zero-width; for tab expansion,
          pre-process: ``text.replace('\t', ' ' * 8)``.

    :param tabsize: Tab stop width for ``'parse'`` and ``'strict'`` modes. Default is 8.
        Must be positive; a TAB does not move the cursor when it is not. Has no effect when
        ``control_codes='ignore'``.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: Maximum cursor position reached, "extent", accounting for cursor movement sequences
        present in ``text`` according to given parameters. This represents the rightmost column the
        cursor reaches. Always a non-negative integer.

    :raises ValueError: If ``control_codes='strict'`` and control characters with indeterminate
        effects, such as vertical movement or clear sequences are encountered, or on unexpected
        C0 or C1 control code. Also raised when ``control_codes`` is not one of the valid values.

    .. versionadded:: 0.3.0

    Examples::

        >>> width('hello')
        5
        >>> width('コンニチハ')
        10
        >>> width('\x1b[31mred\x1b[0m')
        3
        >>> width('\x1b[31mred\x1b[0m', control_codes='ignore')  # same result (ignored)
        3
        >>> width('123\b4')  # backspace overwrites previous cell (outputs '124')
        3
        >>> width('abc\t')  # tab caused cursor to move to column 8
        8
        >>> width('1\x1b[10C')  # '1' + cursor right 10, cursor ends on column 11
        11
        >>> width('1\x1b[10C', control_codes='ignore')  # faster but wrong in this case
        1
    """
    # pylint: disable=too-complex,too-many-branches,too-many-statements,too-many-locals
    # This could be broken into sub-functions (#1, #3, and 6 especially), but for reduced overhead
    # considering this function is a likely "hot path", they are inlined, breaking many of our
    # complexity rules.

    # Enforce the documented contract: without this check an unknown mode (a typo such as
    # 'ignored') would silently be measured as 'parse'.
    if control_codes not in ('parse', 'strict', 'ignore'):
        raise ValueError(
            f"Invalid control_codes {control_codes!r}: expected 'parse', 'strict', or 'ignore'"
        )

    # Fast path for ASCII printable (no tabs, escapes, or control chars)
    if text.isascii() and text.isprintable():
        return len(text)

    # Fast parse: if no horizontal cursor movements are possible, switch to 'ignore' mode.
    # Only check for longer strings - the detection overhead hurts short string performance.
    if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN:
        # Check for cursor-affecting control characters
        if '\b' not in text and '\t' not in text and '\r' not in text:
            # Check for escape sequences - if none, or only non-cursor-movement sequences
            if '\x1b' not in text or (
                not CURSOR_RIGHT_SEQUENCE.search(text) and
                not CURSOR_LEFT_SEQUENCE.search(text)
            ):
                control_codes = 'ignore'

    # Fast path for ignore mode -- this is useful if you know the text is already "clean"
    if control_codes == 'ignore':
        return _width_ignored_codes(text, ambiguous_width)

    strict = control_codes == 'strict'
    # Track absolute positions: tab stops need modulo on absolute column, CR resets to 0.
    # Initialize max_extent to 0 so backward movement (CR, BS) won't yield negative width.
    current_col = 0
    max_extent = 0
    idx = 0
    last_measured_idx = -2  # Track index of last measured char for VS16; -2 can never match idx-1
    last_measured_ucs = -1  # Codepoint of last measured char (for deferred emoji check)
    last_was_virama = False  # Virama conjunct formation state
    conjunct_pending = False  # Deferred +1 for bare conjuncts (no trailing Mc)
    text_len = len(text)

    # Select wcwidth call pattern for best lru_cache performance:
    # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls
    # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct)
    _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width)

    while idx < text_len:
        char = text[idx]

        # 1. Handle ESC sequences
        if char == '\x1b':
            match = ZERO_WIDTH_PATTERN.match(text, idx)
            if match:
                seq = match.group()
                if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq):
                    raise ValueError(f"Indeterminate cursor sequence at position {idx}")
                # Apply cursor movement
                right = CURSOR_RIGHT_SEQUENCE.match(seq)
                if right:
                    current_col += int(right.group(1) or 1)
                else:
                    left = CURSOR_LEFT_SEQUENCE.match(seq)
                    if left:
                        current_col = max(0, current_col - int(left.group(1) or 1))
                idx = match.end()
            else:
                # Lone or unrecognized ESC: consume it alone, zero width.
                idx += 1
            max_extent = max(max_extent, current_col)
            continue

        # 2. Handle illegal and vertical control characters (zero width, error in strict)
        if char in ILLEGAL_CTRL:
            if strict:
                raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}")
            idx += 1
            continue

        if char in VERTICAL_CTRL:
            if strict:
                raise ValueError(f"Vertical movement character {ord(char):#x} at position {idx}")
            idx += 1
            continue

        # 3. Handle horizontal movement characters
        if char in HORIZONTAL_CTRL:
            if char == '\x09' and tabsize > 0:  # Tab
                current_col += tabsize - (current_col % tabsize)
            elif char == '\x08':  # Backspace
                if current_col > 0:
                    current_col -= 1
            elif char == '\x0d':  # Carriage return
                current_col = 0
            max_extent = max(max_extent, current_col)
            idx += 1
            continue

        # 4. Handle ZWJ
        if char == '\u200D':
            if last_was_virama:
                # ZWJ after virama requests explicit half-form rendering but
                # does not change cell count — consume ZWJ only, let the next
                # consonant be handled by the virama conjunct rule.
                idx += 1
            elif idx + 1 < text_len:
                # Emoji ZWJ: skip next character unconditionally.
                idx += 2
                last_was_virama = False
            else:
                idx += 1
                last_was_virama = False
            continue

        # 5. Handle other zero-width characters (control chars)
        if char in ZERO_WIDTH_CTRL:
            idx += 1
            continue

        ucs = ord(char)

        # 6. Handle VS16: converts preceding narrow character to wide
        if ucs == 0xFE0F:
            if last_measured_idx == idx - 1:
                if _bisearch(ord(text[last_measured_idx]), VS16_NARROW_TO_WIDE["9.0.0"]):
                    current_col += 1
                    max_extent = max(max_extent, current_col)
            # VS16 preserves emoji context: last_measured_ucs stays as the base
            idx += 1
            continue

        # 6b. Regional Indicator & Fitzpatrick: both above BMP (U+1F1E6+)
        if ucs > 0xFFFF:
            if ucs in _REGIONAL_INDICATOR_SET:
                # Lazy RI pairing: count preceding consecutive RIs
                ri_before = 0
                j = idx - 1
                while j >= 0 and ord(text[j]) in _REGIONAL_INDICATOR_SET:
                    ri_before += 1
                    j -= 1
                if ri_before % 2 == 1:
                    # Second RI of a pair: adds no width of its own.
                    last_measured_ucs = ucs
                    idx += 1
                    continue
            # 6c. Fitzpatrick modifier: zero-width when following emoji base
            elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1]
                  and last_measured_ucs in _EMOJI_ZWJ_SET):
                idx += 1
                continue

        # 7. Virama conjunct formation: consonant following virama contributes 0 width.
        # See https://www.unicode.org/reports/tr44/#Indic_Syllabic_Category
        if last_was_virama and _bisearch(ucs, _ISC_CONSONANT_TABLE):
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
            conjunct_pending = True
            idx += 1
            continue

        # 8. Normal characters: measure with wcwidth
        w = _wcwidth(char)
        if w > 0:
            if conjunct_pending:
                current_col += 1
                conjunct_pending = False
            current_col += w
            max_extent = max(max_extent, current_col)
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
        elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE):
            # Spacing Combining Mark (Mc) following a base character adds 1
            current_col += 1
            max_extent = max(max_extent, current_col)
            last_measured_idx = -2
            last_was_virama = False
            conjunct_pending = False
        else:
            last_was_virama = ucs in _ISC_VIRAMA_SET
        idx += 1

    # A conjunct that was never followed by a measured character or Mc still
    # occupies its deferred cell.
    if conjunct_pending:
        current_col += 1
        max_extent = max(max_extent, current_col)
    return max_extent

717 

718 

def ljust(
    text: str,
    dest_width: int,
    fillchar: str = ' ',
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    ambiguous_width: int = 1,
) -> str:
    r"""
    Pad text on the right so that it occupies a given display width.

    :param text: String to justify, may contain terminal sequences.
    :param dest_width: Total display width of result in terminal cells.
    :param fillchar: Single character for padding (default space). Must have
        display width of 1 (not wide, not zero-width, not combining). Unicode
        characters like ``'·'`` are acceptable. The width is not validated.
    :param control_codes: How to handle control sequences when measuring.
        Passed to :func:`width` for measurement.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: Text padded on the right to reach ``dest_width``. Text that is
        already at least ``dest_width`` cells wide is returned without padding.

    .. versionadded:: 0.3.0

    Example::

        >>> wcwidth.ljust('hi', 5)
        'hi   '
        >>> wcwidth.ljust('\x1b[31mhi\x1b[0m', 5)
        '\x1b[31mhi\x1b[0m   '
        >>> wcwidth.ljust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6)
        '👨\u200d👩\u200d👧    '
    """
    # Printable ASCII occupies exactly one cell per character, so width() is
    # only consulted for anything else.
    plain_ascii = text.isascii() and text.isprintable()
    occupied = len(text) if plain_ascii else width(
        text, control_codes=control_codes, ambiguous_width=ambiguous_width
    )
    return text + fillchar * max(0, dest_width - occupied)

758 

759 

def rjust(
    text: str,
    dest_width: int,
    fillchar: str = ' ',
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    ambiguous_width: int = 1,
) -> str:
    r"""
    Pad text on the left so that it occupies a given display width.

    :param text: String to justify, may contain terminal sequences.
    :param dest_width: Total display width of result in terminal cells.
    :param fillchar: Single character for padding (default space). Must have
        display width of 1 (not wide, not zero-width, not combining). Unicode
        characters like ``'·'`` are acceptable. The width is not validated.
    :param control_codes: How to handle control sequences when measuring.
        Passed to :func:`width` for measurement.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: Text padded on the left to reach ``dest_width``. Text that is
        already at least ``dest_width`` cells wide is returned without padding.

    .. versionadded:: 0.3.0

    Example::

        >>> wcwidth.rjust('hi', 5)
        '   hi'
        >>> wcwidth.rjust('\x1b[31mhi\x1b[0m', 5)
        '   \x1b[31mhi\x1b[0m'
        >>> wcwidth.rjust('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6)
        '    👨\u200d👩\u200d👧'
    """
    # Printable ASCII occupies exactly one cell per character, so width() is
    # only consulted for anything else.
    plain_ascii = text.isascii() and text.isprintable()
    occupied = len(text) if plain_ascii else width(
        text, control_codes=control_codes, ambiguous_width=ambiguous_width
    )
    return fillchar * max(0, dest_width - occupied) + text

799 

800 

def center(
    text: str,
    dest_width: int,
    fillchar: str = ' ',
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    ambiguous_width: int = 1,
) -> str:
    r"""
    Center ``text`` within a field of ``dest_width`` terminal cells.

    The displayed width of ``text`` is measured and the shortfall, if any, is
    split between a left and a right run of ``fillchar``. Text that already
    meets or exceeds ``dest_width`` is returned without padding.

    :param text: String to center, may contain terminal sequences.
    :param dest_width: Total display width of result in terminal cells.
    :param fillchar: Single character for padding (default space). Must have
        display width of 1 (not wide, not zero-width, not combining). Unicode
        characters like ``'·'`` are acceptable. The width is not validated.
    :param control_codes: How to handle control sequences when measuring.
        Passed to :func:`width` for measurement.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: Text padded on both sides to reach ``dest_width``.

    When the total padding is an odd number of cells it cannot be split
    evenly. The surplus cell is placed on the left when ``dest_width`` is
    itself odd, and on the right otherwise. This is the same rule Python's
    :meth:`str.center` applies.

    .. versionadded:: 0.3.0

    Example::

        >>> wcwidth.center('hi', 6)
        '  hi  '
        >>> wcwidth.center('\x1b[31mhi\x1b[0m', 6)
        '  \x1b[31mhi\x1b[0m  '
        >>> wcwidth.center('\U0001F468\u200D\U0001F469\u200D\U0001F467', 6)
        '  👨‍👩‍👧  '
    """
    # Printable ASCII occupies exactly one cell per character, so the
    # (comparatively expensive) width() measurement can be skipped.
    if text.isascii() and text.isprintable():
        measured = len(text)
    else:
        measured = width(text, control_codes=control_codes, ambiguous_width=ambiguous_width)

    slack = max(0, dest_width - measured)

    # Half of the slack goes left; the low-bit term adds the surplus cell to
    # the left only when both the slack and dest_width are odd, see
    # https://jazcap53.github.io/pythons-eccentric-strcenter.html
    left_cells = (slack >> 1) + (slack & dest_width & 1)
    right_cells = slack - left_cells

    leading = fillchar * left_cells
    trailing = fillchar * right_cells
    return leading + text + trailing

846 

847 

def strip_sequences(text: str) -> str:
    r"""
    Remove terminal escape sequences from ``text``.

    Every match of the module's ``ZERO_WIDTH_PATTERN`` is deleted; all other
    characters are returned untouched. Unknown or incomplete ESC sequences
    are preserved.

    :param text: String that may contain terminal escape sequences.
    :returns: The input text with all escape sequences stripped.

    .. versionadded:: 0.3.0

    Example::

        >>> strip_sequences('\x1b[31mred\x1b[0m')
        'red'
        >>> strip_sequences('hello')
        'hello'
        >>> strip_sequences('\x1b[1m\x1b[31mbold red\x1b[0m text')
        'bold red text'
    """
    without_sequences = ZERO_WIDTH_PATTERN.sub('', text)
    return without_sequences

869 

870 

def clip(
    text: str,
    start: int,
    end: int,
    *,
    fillchar: str = ' ',
    tabsize: int = 8,
    ambiguous_width: int = 1,
    propagate_sgr: bool = True,
) -> str:
    r"""
    Extract display columns ``(start, end)`` of ``text``, keeping terminal sequences.

    Positions are measured in visible terminal cells rather than character
    indices. Terminal escape sequences occupy no cells and are carried into
    the result. A wide character (width 2) that straddles either boundary
    cannot be shown in part, so the cells it would have covered inside the
    window are filled with ``fillchar`` instead.

    TAB characters (``\t``) are expanded to spaces up to the next tab stop,
    controlled by the ``tabsize`` parameter.

    Other cursor movement characters (backspace, carriage return) and cursor
    movement sequences are passed through unchanged as zero-width.

    :param text: String to clip, may contain terminal escape sequences.
    :param start: Absolute starting column (inclusive, 0-indexed). Negative
        values are treated as ``0``.
    :param end: Absolute ending column (exclusive). An empty string is
        returned when ``end`` is not greater than ``start``.
    :param fillchar: Character to use when a wide character must be split at
        a boundary (default space). Must have display width of 1.
    :param tabsize: Tab stop width (default 8). Set to 0 to pass tabs through
        as zero-width (preserved in output but don't advance column position).
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :param propagate_sgr: If True (default), SGR (terminal styling) sequences
        are propagated. The result begins with any active style at the start
        position and ends with a reset sequence if styles are active.
    :returns: Substring of ``text`` spanning display columns ``(start, end)``,
        with all terminal sequences preserved and wide characters at boundaries
        replaced with ``fillchar``.

    SGR (terminal styling) sequences are propagated by default. The result
    begins with any active style and ends with a reset::

        >>> clip('\x1b[1;34mHello world\x1b[0m', 6, 11)
        '\x1b[1;34mworld\x1b[0m'

    Set ``propagate_sgr=False`` to disable this behavior.

    .. versionadded:: 0.3.0

    .. versionchanged:: 0.5.0
       Added ``propagate_sgr`` parameter (default True).

    Example::

        >>> clip('hello world', 0, 5)
        'hello'
        >>> clip('中文字', 0, 3)  # Wide char split at column 3
        '中 '
        >>> clip('a\tb', 0, 10)  # Tab expanded to spaces
        'a       b'
    """
    # pylint: disable=too-complex,too-many-locals,too-many-branches,too-many-statements,too-many-nested-blocks
    # This is a 'hot path': everything is kept inline rather than delegated to
    # helper functions, trading complexity for python performance.
    if start < 0:
        start = 0
    if end <= start:
        return ''

    # Fast path: printable ASCII only (no tabs, escape sequences, or wide or
    # zero-width chars), so columns and character indices coincide.
    if text.isascii() and text.isprintable():
        return text[start:end]

    # Fast path: without any ESC in the text there is no SGR state to track.
    track_sgr = propagate_sgr and '\x1b' in text

    # SGR state captured when the first visible cell is emitted; stays None
    # until then, and permanently when SGR tracking is off.
    style_at_start = None
    if track_sgr:
        style = _SGR_STATE_DEFAULT  # running SGR state, updated by every SGR sequence

    pieces: list[str] = []
    column = 0
    pos = 0
    length = len(text)

    while pos < length:
        ch = text[pos]
        at_escape = ch == '\x1b'

        # Early exit: window already filled, start style captured, and the
        # next character is not the beginning of an escape sequence.
        if not at_escape and column >= end and style_at_start is not None:
            break

        if at_escape:
            found = ZERO_WIDTH_PATTERN.match(text, pos)
            if found:
                sequence = found.group()
                if track_sgr and _SGR_PATTERN.match(sequence):
                    # Fold into the running state; re-emitted as a prefix
                    # once the result is assembled.
                    style = _sgr_state_update(style, sequence)
                else:
                    # Non-SGR sequences are always preserved.
                    pieces.append(sequence)
                pos = found.end()
            else:
                # Bare ESC that does not begin a recognized sequence.
                pieces.append(ch)
                pos += 1
            continue

        if ch == '\t':
            if tabsize > 0:
                # Walk cell by cell up to the next tab stop, emitting a space
                # for each cell that lies inside the window.
                tab_stop = column + (tabsize - (column % tabsize))
                while column < tab_stop:
                    if start <= column < end:
                        pieces.append(' ')
                        if track_sgr and style_at_start is None:
                            style_at_start = style
                    column += 1
            else:
                # tabsize of 0: tab is kept verbatim and occupies no cells.
                pieces.append(ch)
            pos += 1
            continue

        # Anything else is measured one grapheme cluster at a time.
        cluster = next(iter_graphemes(text, start=pos))
        cells = width(cluster, ambiguous_width=ambiguous_width)

        if cells == 0:
            # Zero-width clusters are kept only while inside the window and
            # never count as the first visible content.
            if start <= column < end:
                pieces.append(cluster)
        else:
            right_edge = column + cells
            if column >= start and right_edge <= end:
                # Fully visible.
                pieces.append(cluster)
                shown = True
            elif column < end and right_edge > start:
                # Straddles a boundary: fill only the cells inside the window.
                pieces.append(fillchar * (min(end, right_edge) - max(start, column)))
                shown = True
            else:
                shown = False
            if shown and track_sgr and style_at_start is None:
                style_at_start = style
            column = right_edge

        pos += len(cluster)

    clipped = ''.join(pieces)
    if style_at_start is None:
        return clipped

    # Re-apply the style that was active at the clip start, and close it with
    # a reset when that style is not the default.
    prefix = _sgr_state_to_sequence(style_at_start)
    if prefix:
        clipped = prefix + clipped
    if _sgr_state_is_active(style_at_start):
        clipped += '\x1b[0m'
    return clipped