Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/_width.py: 9%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

160 statements  

1"""This is a high-level width() supporting terminal output.""" 

2 

3from typing import Literal 

4 

5# local 

6from ._wcwidth import wcwidth 

7from .bisearch import bisearch 

8from ._wcswidth import wcswidth 

9from ._constants import (_EMOJI_ZWJ_SET, 

10 _ISC_VIRAMA_SET, 

11 _CATEGORY_MC_TABLE, 

12 _FITZPATRICK_RANGE, 

13 _REGIONAL_INDICATOR_SET) 

14from .table_vs16 import VS16_NARROW_TO_WIDE 

15from .text_sizing import TextSizing, TextSizingParams 

16from .control_codes import ILLEGAL_CTRL, VERTICAL_CTRL, HORIZONTAL_CTRL, ZERO_WIDTH_CTRL 

17from .table_grapheme import ISC_CONSONANT 

18from .escape_sequences import (_SEQUENCE_CLASSIFY, 

19 TEXT_SIZING_PATTERN, 

20 CURSOR_MOVEMENT_SEQUENCE, 

21 INDETERMINATE_EFFECT_SEQUENCE, 

22 strip_sequences) 

23 

24# In 'parse' mode, strings longer than this are checked for cursor-movement 

25# controls (BS, TAB, CR, cursor sequences); when absent, mode downgrades to 

26# 'ignore' to skip character-by-character parsing. The detection scan cost is 

27# negligible for long strings but wasted on short ones like labels or headings. 

28_WIDTH_FAST_PATH_MIN_LEN = 20 

29 

30# Translation table to strip C0/C1 control characters for fast 'ignore' mode. 

31_CONTROL_CHAR_TABLE = str.maketrans('', '', ( 

32 ''.join(chr(c) for c in range(0x00, 0x20)) + # C0: NUL through US (including tab) 

33 '\x7f' + # DEL 

34 ''.join(chr(c) for c in range(0x80, 0xa0)) # C1: U+0080-U+009F 

35)) 

36 

37 

38def _width_ignored_codes(text: str, ambiguous_width: int = 1) -> int: 

39 """ 

40 Fast path for width() with control_codes='ignore'. 

41 

42 Strips escape sequences and control characters, then measures remaining text. 

43 """ 

44 return wcswidth( 

45 strip_sequences(text).translate(_CONTROL_CHAR_TABLE), 

46 ambiguous_width=ambiguous_width 

47 ) 

48 

49 

50def width( 

51 text: str, 

52 *, 

53 control_codes: Literal['parse', 'strict', 'ignore'] = 'parse', 

54 tabsize: int = 8, 

55 ambiguous_width: int = 1, 

56) -> int: 

57 r""" 

58 Return printable width of text containing many kinds of control codes and sequences. 

59 

60 Unlike :func:`wcswidth`, this function handles most control characters and many popular terminal 

61 output sequences. Never returns -1. 

62 

63 :param text: String to measure. 

64 :param control_codes: How to handle control characters and sequences: 

65 

66 - ``'parse'`` (default): Track horizontal cursor movement like BS ``\b``, CR ``\r``, TAB 

67 ``\t``, cursor left and right movement sequences. Vertical movement (LF, VT, FF) and 

68 indeterminate terminal sequences are zero-width. OSC 66 Kitty Text Sizing protocol, OSC 8 

69 Hyperlink, and many other kinds of output sequences are parsed for displayed measurements. 

70 - ``'strict'``: Like parse, but raises :exc:`ValueError` on control characters with 

71 indeterminate results of the screen or cursor, like clear or vertical movement. Generally, 

72 these should be handled with a virtual terminal emulator (like 'pyte'). 

73 - ``'ignore'``: All C0 and C1 control characters and escape sequences are measured as 

74 width 0. This is the fastest measurement for text already filtered or known not to contain 

75 any kinds of control codes or sequences. TAB ``\t`` is zero-width; to ensure 

76 tab expansion, pre-process text using :func:`str.expandtabs`. 

77 

78 :param tabsize: Tab stop width for ``'parse'`` and ``'strict'`` modes. Default is 8. 

79 Must be positive. Has no effect when ``control_codes='ignore'``. 

80 :param ambiguous_width: Width to use for East Asian Ambiguous (A) 

81 characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. 

82 :returns: Maximum cursor position reached, "extent", accounting for cursor movement sequences 

83 present in ``text`` according to given parameters. This represents the rightmost column the 

84 cursor reaches. Always a non-negative integer. 

85 

86 :raises ValueError: If ``control_codes='strict'`` and control characters with indeterminate 

87 effects, such as vertical movement or clear sequences are encountered, or on unexpected 

88 C0 or C1 control code. Also raised when ``control_codes`` is not one of the valid values. 

89 

90 .. versionadded:: 0.3.0 

91 

92 .. versionchanged:: 0.7.0 

93 Expanded strict-mode to raise :exc:`ValueError` when cursor-left movement 

94 (CSI D) would move beyond the beginning of the string. Previously, cursor-left 

95 was silently clamped to column 0 in all modes. 

96 

97 Support horizontal cursor sequences (``cub``, ``cuf``, ``hpa``). Cursor-left (``cub``) or 

98 backspace (``\b``) now overwrites text. ``column_address`` (``hpa``) and carriage return 

99 (``\r``) are now parsed, and some values conditionally raise ``ValueError`` when 

100 ``control_codes='parse'``. 

101 

102 Examples:: 

103 

104 >>> width('hello') 

105 5 

106 >>> width('コンニチハ') 

107 10 

108 >>> width('\x1b[31mred\x1b[0m') 

109 3 

110 >>> width('\x1b[31mred\x1b[0m', control_codes='ignore') # same result (ignored) 

111 3 

112 >>> width('123\b4') # backspace overwrites previous cell (outputs '124') 

113 3 

114 >>> width('abc\t') # tab caused cursor to move to column 8 

115 8 

116 >>> width('1\x1b[10C') # '1' + cursor right 10, cursor ends on column 11 

117 11 

118 >>> width('1\x1b[10C', control_codes='ignore') # faster but wrong in this case 

119 1 

120 """ 

121 # pylint: disable=too-complex,too-many-branches,too-many-statements,too-many-locals 

122 # This could be broken into sub-functions (#1, #3, and #6 especially), but for reduced overhead 

123 # in consideration of this function a likely "hot path", they are inline, breaking many pylint 

124 # complexity rules. 

125 

126 # Fast path for ASCII printable (no tabs, escapes, or control chars) 

127 if text.isascii() and text.isprintable(): 

128 return len(text) 

129 

130 # Fast parse: if no horizontal cursor movements are possible, switch to 'ignore' mode. 

131 # Only check longer strings - the detection overhead hurts short string performance. 

132 if control_codes == 'parse' and len(text) > _WIDTH_FAST_PATH_MIN_LEN: 

133 # Check for cursor-affecting control characters 

134 if '\b' not in text and '\t' not in text and '\r' not in text: 

135 # Check for escape sequences, if none contain cursor movement or 

136 # text sizing, downgrade to 'ignore' 

137 if '\x1b' not in text or ( 

138 not CURSOR_MOVEMENT_SEQUENCE.search(text) 

139 and not TEXT_SIZING_PATTERN.search(text) 

140 ): 

141 control_codes = 'ignore' 

142 

143 # Fast path for ignore mode, useful if you know the text is already free of control codes 

144 if control_codes == 'ignore': 

145 return _width_ignored_codes(text, ambiguous_width) 

146 

147 strict = control_codes == 'strict' 

148 # Track absolute positions: tab stops need modulo on absolute column, CR resets to 0. 

149 # Initialize max_extent to 0 so backward movement (CR, BS) won't yield negative width. 

150 current_col = 0 

151 max_extent = 0 

152 idx = 0 

153 text_len = len(text) 

154 

155 # Select wcwidth call pattern for best lru_cache performance: 

156 # - ambiguous_width=1 (default): single-arg calls share cache with direct wcwidth() calls 

157 # - ambiguous_width=2: full positional args needed (results differ, separate cache is correct) 

158 _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) 

159 

160 # grapheme-clustering state 

161 last_measured_idx = -2 

162 last_measured_ucs = -1 

163 last_was_virama = False 

164 conjunct_pending = False 

165 

166 while idx < text_len: 

167 char = text[idx] 

168 

169 # 1. ESC sequences 

170 if char == '\x1b': 

171 m = _SEQUENCE_CLASSIFY.match(text, idx) 

172 if not m: 

173 # 1a. Errant ESC or unknown sequence: only the first character is zero-width 

174 idx += 1 

175 else: 

176 seq = m.group() 

177 if strict and INDETERMINATE_EFFECT_SEQUENCE.match(seq): 

178 raise ValueError(f"Indeterminate cursor sequence at position {idx}, {seq!r}") 

179 

180 # 2b. horizontal position absolute (before forward/backward to 

181 # avoid other_seq match in _SEQUENCE_CLASSIFY) 

182 if (hpa_n := m.group('hpa_n')) is not None: 

183 target_col = int(hpa_n) if hpa_n else 1 

184 if strict: 

185 raise ValueError( 

186 f"Indeterminate horizontal position at position {idx}, " 

187 f"{seq!r} (absolute column unknown)" 

188 ) 

189 current_col = target_col - 1 # HPA is 1-indexed, convert to 0-indexed 

190 # 2c. cursor forward, backward 

191 elif (cforward_n := m.group('cforward_n')) is not None: 

192 current_col += int(cforward_n) if cforward_n else 1 

193 elif (cbackward_n := m.group('cbackward_n')) is not None: 

194 n_backward = int(cbackward_n) if cbackward_n else 1 

195 if strict and n_backward > current_col: 

196 raise ValueError( 

197 f"Cursor left movement at position {idx} would move " 

198 f"{n_backward} cells left from column {current_col}, " 

199 f"exceeding string start" 

200 ) 

201 current_col = max(0, current_col - n_backward) 

202 # 2d. OSC 66 Text Sizing — has positive display width 

203 elif (ts_meta := m.group('ts_meta')) is not None: 

204 ts_text = m.group('ts_text') 

205 ts_term = m.group('ts_term') 

206 assert ts_text is not None and ts_term is not None 

207 text_size = TextSizing( 

208 TextSizingParams.from_params(ts_meta, control_codes=control_codes), 

209 ts_text, ts_term) 

210 current_col += text_size.display_width(ambiguous_width) 

211 # 2e. SGR and other zero-width sequences -- no column advance 

212 idx = m.end() 

213 # Escape sequences break VS16 adjacency: reset last-measured state 

214 last_measured_idx = -2 

215 last_measured_ucs = -1 

216 max_extent = max(max_extent, current_col) 

217 continue 

218 

219 # 2. Vertical or Illegal control characters zero width or error when 'strict' 

220 if char in ILLEGAL_CTRL: 

221 if strict: 

222 raise ValueError(f"Illegal control character {ord(char):#x} at position {idx}") 

223 idx += 1 

224 last_measured_idx = -2 

225 last_measured_ucs = -1 

226 continue 

227 

228 if char in VERTICAL_CTRL: 

229 if strict: 

230 raise ValueError(f"Vertical movement character {ord(char):#x} at position {idx}") 

231 idx += 1 

232 last_measured_idx = -2 

233 last_measured_ucs = -1 

234 continue 

235 

236 # 3. Horizontal movement characters 

237 if char in HORIZONTAL_CTRL: 

238 if char == '\t' and tabsize > 0: 

239 current_col += tabsize - (current_col % tabsize) 

240 elif char == '\b': 

241 if current_col > 0: 

242 current_col -= 1 

243 elif char == '\r': 

244 if strict: 

245 raise ValueError( 

246 f"Horizontal movement character \\r at position {idx}: " 

247 "indeterminate starting column" 

248 ) 

249 current_col = 0 

250 max_extent = max(max_extent, current_col) 

251 idx += 1 

252 last_measured_idx = -2 

253 last_measured_ucs = -1 

254 continue 

255 

256 # 4. Zero-width control characters 

257 if char in ZERO_WIDTH_CTRL: 

258 idx += 1 

259 last_measured_idx = -2 

260 last_measured_ucs = -1 

261 continue 

262 

263 # 5. Inline grapheme-clustering: ZWJ, VS16, Regional Indicators, 

264 # Fitzpatrick, Virama conjuncts, Mc, wcwidth 

265 ucs = ord(char) 

266 

267 # ZWJ (U+200D) 

268 if ucs == 0x200D: 

269 if last_was_virama: 

270 idx += 1 

271 elif idx + 1 < text_len: 

272 last_was_virama = False 

273 idx += 2 

274 else: 

275 last_was_virama = False 

276 idx += 1 

277 continue 

278 

279 # VS16 (U+FE0F): converts preceding narrow character to wide. 

280 if ucs == 0xFE0F and last_measured_idx >= 0: 

281 if bisearch(ord(text[last_measured_idx]), VS16_NARROW_TO_WIDE['9.0.0']): 

282 current_col += 1 

283 max_extent = max(max_extent, current_col) 

284 last_measured_idx = -2 # prevent double application 

285 idx += 1 

286 continue 

287 

288 # Regional Indicator & Fitzpatrick (both above BMP) 

289 if ucs > 0xFFFF: 

290 if ucs in _REGIONAL_INDICATOR_SET: 

291 ri_before = 0 

292 j = idx - 1 

293 while j >= 0 and ord(text[j]) in _REGIONAL_INDICATOR_SET: 

294 ri_before += 1 

295 j -= 1 

296 if ri_before % 2 == 1: 

297 last_measured_ucs = ucs 

298 idx += 1 

299 continue 

300 elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] 

301 and last_measured_ucs in _EMOJI_ZWJ_SET): 

302 idx += 1 

303 continue 

304 

305 # Virama conjunct formation 

306 if last_was_virama and bisearch(ucs, ISC_CONSONANT): 

307 last_measured_idx = idx 

308 last_measured_ucs = ucs 

309 last_was_virama = False 

310 conjunct_pending = True 

311 idx += 1 

312 continue 

313 

314 # Normal character: measure with wcwidth 

315 w = _wcwidth(char) 

316 if w > 0: 

317 if conjunct_pending: 

318 current_col += 1 

319 conjunct_pending = False 

320 current_col += w 

321 max_extent = max(max_extent, current_col) 

322 last_measured_idx = idx 

323 last_measured_ucs = ucs 

324 last_was_virama = False 

325 elif last_measured_idx >= 0 and bisearch(ucs, _CATEGORY_MC_TABLE): 

326 # Spacing Combining Mark (Mc) following a base character adds 1 

327 current_col += 1 

328 max_extent = max(max_extent, current_col) 

329 last_measured_idx = -2 

330 last_was_virama = False 

331 conjunct_pending = False 

332 else: 

333 last_was_virama = ucs in _ISC_VIRAMA_SET 

334 idx += 1 

335 

336 if conjunct_pending: 

337 current_col += 1 

338 max_extent = max(max_extent, current_col) 

339 return max_extent