Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/_width.py: 9%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""This is a high-level width() supporting terminal output."""
3from typing import Literal
5# local
6from ._wcwidth import wcwidth
7from .bisearch import bisearch
8from ._wcswidth import wcswidth
9from ._constants import (_EMOJI_ZWJ_SET,
10 _ISC_VIRAMA_SET,
11 _CATEGORY_MC_TABLE,
12 _FITZPATRICK_RANGE,
13 _REGIONAL_INDICATOR_SET)
14from .table_vs16 import VS16_NARROW_TO_WIDE
15from .text_sizing import TextSizing, TextSizingParams
16from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL
17from .table_grapheme import ISC_CONSONANT
18from .escape_sequences import (_SEQUENCE_CLASSIFY,
19 TEXT_SIZING_PATTERN,
20 CURSOR_MOVEMENT_SEQUENCE,
21 INDETERMINATE_EFFECT_SEQUENCE,
22 strip_sequences)
24# In 'parse' mode, strings longer than this are checked for cursor-movement
25# controls (BS, TAB, CR, cursor sequences); when absent, mode downgrades to
26# 'ignore' to skip character-by-character parsing. The detection scan cost is
27# negligible for long strings but wasted on short ones like labels or headings.
28_WIDTH_FAST_PATH_MIN_LEN = 20
30# Translation table to strip C0/C1 control characters for fast 'ignore' mode.
31_CONTROL_CHAR_TABLE = str.maketrans('', '', (
32 ''.join(chr(c) for c in range(0x00, 0x20)) + # C0: NUL through US (including tab)
33 '\x7f' + # DEL
34 ''.join(chr(c) for c in range(0x80, 0xa0)) # C1: U+0080-U+009F
35))
def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int:
    """
    Measure ``text`` for width() when ``control_codes='ignore'``.

    Escape sequences are removed with :func:`strip_sequences`, control characters are then
    deleted using ``_CONTROL_CHAR_TABLE``, and the remaining text is measured by
    :func:`wcswidth`.

    :param text: String to measure.
    :param ambiguous_width: Forwarded unchanged to :func:`wcswidth`.
    :returns: The value returned by :func:`wcswidth` for the filtered text.
    """
    without_sequences = strip_sequences(text)
    printable_only = without_sequences.translate(_CONTROL_CHAR_TABLE)
    return wcswidth(printable_only, ambiguous_width=ambiguous_width)
def width(
    text: str,
    *,
    control_codes: Literal['parse', 'strict', 'ignore'] = 'parse',
    tabsize: int = 8,
    ambiguous_width: int = 1,
) -> int:
    r"""
    Return printable width of text containing many kinds of control codes and sequences.

    Unlike :func:`wcswidth`, this function handles most control characters and many popular terminal
    output sequences. Never returns -1.

    :param text: String to measure.
    :param control_codes: How to handle control characters and sequences:

        - ``'parse'`` (default): Track horizontal cursor movement like BS ``\b``, CR ``\r``, TAB
          ``\t``, cursor left and right movement sequences. Vertical movement (LF, VT, FF) and
          indeterminate terminal sequences are zero-width. OSC 66 Kitty Text Sizing protocol, OSC 8
          Hyperlink, and many other kinds of output sequences are parsed for displayed measurements.
        - ``'strict'``: Like parse, but raises :exc:`ValueError` on control characters with
          indeterminate results of the screen or cursor, like clear or vertical movement. Generally,
          these should be handled with a virtual terminal emulator (like 'pyte').
        - ``'ignore'``: All C0 and C1 control characters and escape sequences are measured as
          width 0. This is the fastest measurement for text already filtered or known not to contain
          any kinds of control codes or sequences. TAB ``\t`` is zero-width; to ensure
          tab expansion, pre-process text using :func:`str.expandtabs`.

    :param tabsize: Tab stop width for ``'parse'`` and ``'strict'`` modes. Default is 8.
        Must be positive; a non-positive value makes TAB zero-width. Has no effect when
        ``control_codes='ignore'``.
    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: Maximum cursor position reached, "extent", accounting for cursor movement sequences
        present in ``text`` according to given parameters. This represents the rightmost column the
        cursor reaches. Always a non-negative integer.

    :raises ValueError: If ``control_codes='strict'`` and control characters with indeterminate
        effects, such as vertical movement or clear sequences are encountered, or on unexpected
        C0 or C1 control code. Also raised when ``control_codes`` is not one of the valid values.

    .. versionadded:: 0.3.0

    .. versionchanged:: 0.7.0
       Expanded strict-mode to raise :exc:`ValueError` when cursor-left movement
       (CSI D) would move beyond the beginning of the string. Previously, cursor-left
       was silently clamped to column 0 in all modes.

       Support horizontal cursor sequences (``cub``, ``cuf``, ``hpa``). Cursor-left (``cub``) or
       backspace (``\b``) now overwrites text. ``column_address`` (``hpa``) and carriage return
       (``\r``) are now parsed, and raise ``ValueError`` when ``control_codes='strict'``.

    Examples::

        >>> width('hello')
        5
        >>> width('コンニチハ')
        10
        >>> width('\x1b[31mred\x1b[0m')
        3
        >>> width('\x1b[31mred\x1b[0m', control_codes='ignore')  # same result (ignored)
        3
        >>> width('123\b4')  # backspace overwrites previous cell (outputs '124')
        3
        >>> width('abc\t')  # tab caused cursor to move to column 8
        8
        >>> width('1\x1b[10C')  # '1' + cursor right 10, cursor ends on column 11
        11
        >>> width('1\x1b[10C', control_codes='ignore')  # faster but wrong in this case
        1
    """
    # pylint: disable=too-complex,too-many-branches,too-many-statements,too-many-locals
    # This could be broken into sub-functions (one per numbered section below), but for reduced
    # overhead in consideration of this function a likely "hot path", they are inline, breaking
    # many pylint complexity rules.

    # Validate the mode before any fast path, so that an invalid value is rejected consistently
    # (as documented) rather than silently measured as if it were 'parse'.
    if control_codes not in ('parse', 'strict', 'ignore'):
        raise ValueError(
            f"control_codes must be 'parse', 'strict', or 'ignore', got {control_codes!r}"
        )

    # Fast path for ASCII printable (no tabs, escapes, or control chars)
    if text.isascii() and text.isprintable():
        return len(text)

    # Fast parse: if no horizontal cursor movements are possible, switch to 'ignore' mode.
    # Only check longer strings - the detection overhead hurts short string performance.
    if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN:
        # Check for cursor-affecting control characters
        if '\b' not in text and '\t' not in text and '\r' not in text:
            # Check for escape sequences, if none contain cursor movement or
            # text sizing, downgrade to 'ignore'
            if '\x1b' not in text or (
                not CURSOR_MOVEMENT_SEQUENCE.search(text)
                and not TEXT_SIZING_PATTERN.search(text)
            ):
                control_codes = 'ignore'

    # Fast path for ignore mode, useful if you know the text is already free of control codes
    if control_codes == 'ignore':
        return _width_ignored_codes(text, ambiguous_width)

    strict = control_codes == 'strict'
    # Track absolute positions: tab stops need modulo on absolute column, CR resets to 0.
    # Initialize max_extent to 0 so backward movement (CR, BS) won't yield negative width.
    current_col = 0
    max_extent = 0
    idx = 0
    text_len = len(text)

    # Select wcwidth call pattern for best lru_cache performance:
    # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls
    # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct)
    _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width)

    # grapheme-clustering state
    last_measured_idx = -2      # index of last positively-measured char; -2 means "none"
    last_measured_ucs = -1      # codepoint of that char; -1 means "none"
    last_was_virama = False     # previous zero-width char was a virama
    conjunct_pending = False    # a virama+consonant conjunct awaits its extra cell

    while idx < text_len:
        char = text[idx]

        # 1. ESC sequences
        if char == '\x1b':
            m = _SEQUENCE_CLASSIFY.match(text, idx)
            if not m:
                # 1a. Errant ESC or unknown sequence: only the first character is zero-width
                idx += 1
            else:
                seq = m.group()
                if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq):
                    raise ValueError(f"Indeterminate cursor sequence at position {idx}, {seq!r}")

                # 1b. horizontal position absolute (before forward/backward to
                # avoid other_seq match in _SEQUENCE_CLASSIFY)
                if (hpa_n := m.group('hpa_n')) is not None:
                    target_col = int(hpa_n) if hpa_n else 1
                    if strict:
                        raise ValueError(
                            f"Indeterminate horizontal position at position {idx}, "
                            f"{seq!r} (absolute column unknown)"
                        )
                    current_col = target_col - 1  # HPA is 1-indexed, convert to 0-indexed
                # 1c. cursor forward, backward
                elif (cforward_n := m.group('cforward_n')) is not None:
                    current_col += int(cforward_n) if cforward_n else 1
                elif (cbackward_n := m.group('cbackward_n')) is not None:
                    n_backward = int(cbackward_n) if cbackward_n else 1
                    if strict and n_backward > current_col:
                        raise ValueError(
                            f"Cursor left movement at position {idx} would move "
                            f"{n_backward} cells left from column {current_col}, "
                            f"exceeding string start"
                        )
                    current_col = max(0, current_col - n_backward)
                # 1d. OSC 66 Text Sizing — has positive display width
                elif (ts_meta := m.group('ts_meta')) is not None:
                    ts_text = m.group('ts_text')
                    ts_term = m.group('ts_term')
                    assert ts_text is not None and ts_term is not None
                    text_size = TextSizing(
                        TextSizingParams.from_params(ts_meta, control_codes=control_codes),
                        ts_text, ts_term)
                    current_col += text_size.display_width(ambiguous_width)
                # 1e. SGR and other zero-width sequences -- no column advance
                idx = m.end()
            # Escape sequences break VS16 adjacency: reset last-measured state
            last_measured_idx = -2
            last_measured_ucs = -1
            max_extent = max(max_extent, current_col)
            continue

        # 2. Vertical or Illegal control characters zero width or error when 'strict'
        if char in ILLEGAL_CTRL:
            if strict:
                raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}")
            idx += 1
            last_measured_idx = -2
            last_measured_ucs = -1
            continue

        if char in VERTICAL_CTRL:
            if strict:
                raise ValueError(f"Vertical movement character {ord(char):#x} at position {idx}")
            idx += 1
            last_measured_idx = -2
            last_measured_ucs = -1
            continue

        # 3. Horizontal movement characters
        if char in HORIZONTAL_CTRL:
            if char == '\t' and tabsize > 0:
                # Advance to the next tab stop, computed from the absolute column
                current_col += tabsize - (current_col % tabsize)
            elif char == '\b':
                # Backspace never moves left of column 0
                if current_col > 0:
                    current_col -= 1
            elif char == '\r':
                if strict:
                    raise ValueError(
                        f"Horizontal movement character \\r at position {idx}: "
                        "indeterminate starting column"
                    )
                current_col = 0
            max_extent = max(max_extent, current_col)
            idx += 1
            last_measured_idx = -2
            last_measured_ucs = -1
            continue

        # 4. Zero-width control characters
        if char in ZERO_WIDTH_CTRL:
            idx += 1
            last_measured_idx = -2
            last_measured_ucs = -1
            continue

        # 5. Inline grapheme-clustering: ZWJ, VS16, Regional Indicators,
        # Fitzpatrick, Virama conjuncts, Mc, wcwidth
        ucs = ord(char)

        # ZWJ (U+200D)
        if ucs == 0x200D:
            if last_was_virama:
                # After a virama only the ZWJ itself is consumed
                idx += 1
            elif idx + 1 < text_len:
                # Otherwise the ZWJ and the character it joins are both consumed unmeasured
                last_was_virama = False
                idx += 2
            else:
                # Trailing ZWJ with nothing after it
                last_was_virama = False
                idx += 1
            continue

        # VS16 (U+FE0F): converts preceding narrow character to wide.
        if ucs == 0xFE0F and last_measured_idx >= 0:
            if bisearch(ord(text[last_measured_idx]), VS16_NARROW_TO_WIDE['9.0.0']):
                current_col += 1
                max_extent = max(max_extent, current_col)
            last_measured_idx = -2  # prevent double application
            idx += 1
            continue

        # Regional Indicator & Fitzpatrick (both above BMP)
        if ucs > 0xFFFF:
            if ucs in _REGIONAL_INDICATOR_SET:
                # Count the regional indicators immediately preceding this one in the raw text
                ri_before = 0
                j = idx - 1
                while j >= 0 and ord(text[j]) in _REGIONAL_INDICATOR_SET:
                    ri_before += 1
                    j -= 1
                if ri_before % 2 == 1:
                    # An odd count makes this the second of a pair: it adds no width
                    last_measured_ucs = ucs
                    idx += 1
                    continue
            elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1]
                  and last_measured_ucs in _EMOJI_ZWJ_SET):
                # Fitzpatrick modifier directly after such an emoji adds no width
                idx += 1
                continue

        # Virama conjunct formation
        if last_was_virama and bisearch(ucs, ISC_CONSONANT):
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
            conjunct_pending = True
            idx += 1
            continue

        # Normal character: measure with wcwidth
        w = _wcwidth(char)
        if w > 0:
            if conjunct_pending:
                # Settle the extra cell owed by the pending conjunct before this character
                current_col += 1
                conjunct_pending = False
            current_col += w
            max_extent = max(max_extent, current_col)
            last_measured_idx = idx
            last_measured_ucs = ucs
            last_was_virama = False
        elif last_measured_idx >= 0 and bisearch(ucs, _CATEGORY_MC_TABLE):
            # Spacing Combining Mark (Mc) following a base character adds 1
            current_col += 1
            max_extent = max(max_extent, current_col)
            last_measured_idx = -2
            last_was_virama = False
            conjunct_pending = False
        else:
            last_was_virama = ucs in _ISC_VIRAMA_SET
        idx += 1

    # A conjunct still pending at end of text is owed its extra cell
    if conjunct_pending:
        current_col += 1
        max_extent = max(max_extent, current_col)
    return max_extent