Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/_wcswidth.py: 6%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

252 statements  

1"""This is a python implementation of wcswidth().""" 

2 

3from __future__ import annotations 

4 

5from typing import Optional 

6 

7__lazy_modules__ = [ 

8 "wcwidth._constants", 

9 "wcwidth._wcwidth", 

10 "wcwidth.bisearch", 

11 "wcwidth.table_grapheme", 

12 "wcwidth.table_vs16", 

13] 

14# local 

15from . import table_grapheme_overrides 

16from ._wcwidth import wcwidth 

17from .bisearch import bisearch 

18from ._constants import (_EMOJI_ZWJ_SET, 

19 _ISC_VIRAMA_SET, 

20 _CATEGORY_MC_TABLE, 

21 _FITZPATRICK_RANGE, 

22 _REGIONAL_INDICATOR_SET, 

23 resolve_terminal, 

24 get_term_overrides) 

25from .table_vs15 import VS15_WIDE_TO_NARROW 

26from .table_vs16 import VS16_NARROW_TO_WIDE 

27from .table_grapheme import GRAPHEME_EXTEND 

28 

29 

30def _scan_zwj_cluster_end(text: str, start: int, end: int) -> int: 

31 """ 

32 Scan forward from *start* (base character) to end of a ZWJ grapheme cluster. 

33 

34 Follows the UAX #29 GB11 pattern (ExtPict Extend* ZWJ x ExtPict) chained repeatedly until no 

35 more ZWJ joins are found. 

36 """ 

37 idx = start + 1 

38 # Skip Extend characters (Fitzpatrick modifiers, etc.) before first ZWJ 

39 while idx < end and bisearch(ord(text[idx]), GRAPHEME_EXTEND): 

40 idx += 1 

41 # Follow ZWJ chains 

42 while idx < end: 

43 if ord(text[idx]) != 0x200D: 

44 break 

45 idx += 1 

46 # GB11: \p{ExtPict} Extend* ZWJ × \p{ExtPict} 

47 # Extend modifiers (VS16, Fitzpatrick skin tones, etc.) attach to 

48 # the ExtPict *before* the ZWJ, not after it. After ZWJ the next 

49 # codepoint is always an ExtPict directly, no Extend skip needed. 

50 if idx < end and ord(text[idx]) in _EMOJI_ZWJ_SET: 

51 idx += 1 

52 # Skip trailing Extend (VS16, etc.) after ExtPict before next ZWJ 

53 while idx < end and bisearch(ord(text[idx]), GRAPHEME_EXTEND): 

54 idx += 1 

55 continue 

56 break 

57 return idx 

58 

59 

def wcswidth(
    pwcs: str,
    n: Optional[int] = None,
    unicode_version: str = 'auto',
    ambiguous_width: int = 1,
) -> int:
    """
    Given a unicode string, return its printable length on a terminal.

    See :ref:`Specification` for details of cell measurement.

    This implementation differs from Markus Kuhn's original POSIX C implementation, in that this
    ``wcswidth()`` processes grapheme strings yielded by :func:`wcwidth.iter_graphemes` defined by
    `Unicode Standard Annex #29`_. POSIX wcswidth(3) is not grapheme-aware and does not measure many
    kinds of Emojis or complex scripts correctly.

    :param pwcs: Measure width of given unicode string.
    :param n: When ``n`` is None (default), return the length of the entire
        string, otherwise only the first ``n`` characters are measured. A value
        larger than ``len(pwcs)`` measures the entire string; a value of zero or
        less measures nothing and yields ``0``.
    :param unicode_version: Ignored. Retained for backwards compatibility.

        .. deprecated:: 0.3.0
           Only the latest Unicode version is now shipped.

    :param ambiguous_width: Width to use for East Asian Ambiguous (A)
        characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts.
    :returns: The width, in cells, needed to display the first ``n`` characters
        of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control
        characters!

    .. _`Unicode Standard Annex #29`: https://www.unicode.org/reports/tr29/
    """
    # pylint: disable=unused-argument,too-many-locals,too-many-statements,redefined-variable-type
    # pylint: disable=too-complex,too-many-branches,duplicate-code,too-many-nested-blocks

    # Never scan past the end of the string: an ``n`` larger than the string would otherwise
    # index out of range in the loop below.
    length = len(pwcs)
    end = length if n is None else min(n, length)

    # Fast path: pure ASCII printable strings are always width == length, so any prefix of one
    # is as wide as its character count.
    if pwcs.isascii() and pwcs.isprintable():
        return max(end, 0)

    _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width)

    total_width = 0
    idx = 0

    # Widths accumulate in cluster_width and are added to total_width when the next
    # width-bearing character starts a new cluster, or at the end of the string.
    last_measured_idx = -2  # -2 sentinel blocks VS16/VS15 (no base available)
    last_measured_ucs = -1
    last_measured_w = 0
    prev_was_virama = False
    cluster_width = 0
    vs16_nw_table = VS16_NARROW_TO_WIDE['9.0.0']
    vs15_wn_table = VS15_WIDE_TO_NARROW['9.0.0']
    _bisearch = bisearch

    while idx < end:
        char = pwcs[idx]
        ucs = ord(char)

        # 5. ZWJ (U+200D): consumed without contributing width.
        # Virama codepoints are treated as zero-width combining marks (Mn). When a
        # virama+consonant sequence forms a conjunct, its width is capped at 2 cells.
        if ucs == 0x200D:
            if prev_was_virama:
                # keep the virama state so the following consonant joins the conjunct
                idx += 1
            elif idx + 1 < end:
                # skip the ZWJ together with the character it joins
                last_measured_w = 0
                prev_was_virama = False
                idx += 2
            else:
                prev_was_virama = False
                idx += 1
            continue

        # 6. VS16 (U+FE0F): converts preceding narrow character to wide.
        if ucs == 0xFE0F and last_measured_idx >= 0:
            if _bisearch(last_measured_ucs, vs16_nw_table):
                cluster_width = 2
            last_measured_idx = -2  # prevent double application
            idx += 1
            continue

        # VS15 (U+FE0E): text variation selector, requests narrow presentation.
        if ucs == 0xFE0E and last_measured_idx >= 0:
            if _bisearch(last_measured_ucs, vs15_wn_table) and last_measured_w == 2:
                # the pending cluster still holds 2 cells; pre-subtract one from the total
                total_width -= 1
            idx += 1
            continue

        # 7. Regional Indicator & Fitzpatrick (both above BMP)
        if ucs > 0xFFFF:
            if ucs in _REGIONAL_INDICATOR_SET:
                # count the consecutive Regional Indicators immediately before this one
                ri_before = 0
                j = idx - 1
                while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET:
                    ri_before += 1
                    j -= 1
                if ri_before % 2 == 1:
                    # second indicator of a pair: adds no width of its own
                    last_measured_ucs = ucs
                    idx += 1
                    continue
            elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1]
                  and last_measured_ucs in _EMOJI_ZWJ_SET):
                # skin-tone modifier directly following an emoji base adds no width
                idx += 1
                continue

        # 8. Normal character: measure with wcwidth
        w = _wcwidth(char)
        if w < 0:
            # C0/C1 control character
            return -1
        if w > 0:
            if prev_was_virama:
                # virama+consonant extends the current cluster, capped at 2 cells
                cluster_width = 2
            elif cluster_width:
                # flush the previous cluster and start a new one
                total_width += cluster_width
                cluster_width = w
            else:
                cluster_width = w

            last_measured_idx = idx
            last_measured_ucs = ucs
            last_measured_w = w
            prev_was_virama = False
        elif ucs in _ISC_VIRAMA_SET:
            prev_was_virama = True
        elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE):
            # Spacing Combining Mark (Mc) following a base character
            cluster_width = 2
            last_measured_idx = -2
            prev_was_virama = False
        else:
            prev_was_virama = False
        idx += 1

    # flush the final pending cluster
    if cluster_width:
        total_width += cluster_width
    return total_width

197 

198 

199def wcstwidth( 

200 pwcs: str, 

201 n: Optional[int] = None, 

202 unicode_version: str = 'auto', 

203 ambiguous_width: int = 1, 

204 term_program: bool | str = True, 

205) -> int: 

206 """ 

207 Given a unicode string, return its printable length on a terminal given by ``term_program``. 

208 

209 See :ref:`Specification` for details of cell measurement. 

210 

211 Unlike :func:`wcswidth`, this function applies per-terminal correction tables for 

212 emoji presentation and grapheme clusters. 

213 

214 :param pwcs: Measure width of given unicode string. 

215 :param n: When ``n`` is None (default), return the length of the entire 

216 string, otherwise only the first ``n`` characters are measured. 

217 :param unicode_version: Ignored. Retained for backwards compatibility. 

218 :param ambiguous_width: Width to use for East Asian Ambiguous (A) 

219 characters. Default is ``1`` (narrow). Set to ``2`` for CJK contexts. 

220 :param term_program: Terminal software identifier for table correction. 

221 ``True`` (default) reads the ``TERM_PROGRAM`` or ``TERM`` environment 

222 variable for auto-detection. ``False`` disables override lookup. 

223 Accepts a canonical terminal name matching :func:`list_term_programs`, 

224 such as from XTVERSION_, ENQ_, or ``TERM_PROGRAM``. 

225 

226 .. versionadded:: 0.8.0 

227 :returns: The width, in cells, needed to display the first ``n`` characters 

228 of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control 

229 characters! 

230 """ 

231 # pylint: disable=unused-argument,too-many-locals,too-many-statements,redefined-variable-type 

232 # pylint: disable=too-complex,too-many-branches,duplicate-code,too-many-nested-blocks 

233 # This function intentionally keeps all logic inline for performance. 

234 

235 # Fast path: pure ASCII printable strings are always width == length 

236 if n is None and pwcs.isascii() and pwcs.isprintable(): 

237 return len(pwcs) 

238 

239 # Resolve terminal software for override lookup 

240 term_canonical = resolve_terminal(term_program) 

241 

242 # Skip override lookup when no terminal detected (avoids lru_cache call overhead). 

243 # Extract locals for hot-loop performance (NamedTuple attribute access is slow). 

244 if term_canonical: 

245 overrides = get_term_overrides(term_canonical) 

246 _narrower = overrides.narrower 

247 _vs16_narrower = overrides.vs16_narrower 

248 _vs15_wider = overrides.vs15_wider 

249 _zeroer = overrides.zeroer 

250 _narrow_wider = overrides.narrow_wider 

251 _narrow_zeroer = overrides.narrow_zeroer 

252 _grapheme_overrides = table_grapheme_overrides.get(term_canonical) 

253 else: 

254 _narrower = () 

255 _vs16_narrower = () 

256 _vs15_wider = () 

257 _zeroer = () 

258 _narrow_wider = () 

259 _narrow_zeroer = () 

260 _grapheme_overrides = {} 

261 

262 # Select wcwidth call pattern for best lru_cache performance 

263 _wcwidth = wcwidth if ambiguous_width == 1 else lambda c: wcwidth(c, 'auto', ambiguous_width) 

264 

265 end = len(pwcs) if n is None else n 

266 total_width = 0 

267 idx = 0 

268 

269 # grapheme-clustering state and local re-binding for performance. 

270 # Widths accumulate in cluster_width and flush at boundaries. A cluster is a base character 

271 # plus combining marks, deferring the flush lets grapheme overrides replace the measured width 

272 # retrospectively. 

273 last_measured_idx = -2 # -2 sentinel blocks VS16/VS15 (no base available) 

274 last_measured_ucs = -1 

275 last_measured_w = 0 

276 prev_was_virama = False 

277 cluster_start = -1 

278 total_before_cluster = 0 

279 cluster_width = 0 

280 vs16_nw_table = VS16_NARROW_TO_WIDE['9.0.0'] 

281 vs15_wn_table = VS15_WIDE_TO_NARROW['9.0.0'] 

282 _bisearch = bisearch 

283 

284 while idx < end: 

285 char = pwcs[idx] 

286 ucs = ord(char) 

287 

288 # 

289 # Much of the logic below matches the logic in width(), but is repeated for improved 

290 # performance, they are given matching index reference numbers (starting at #5). 

291 # 

292 # 5. ZWJ (U+200D): consumed without contributing width. 

293 # Virama codepoints are treated as zero-width combining marks (Mn). When a 

294 # virama+consonant sequence forms a conjunct, its width is capped at 2 cells 

295 # matching behavior of popular terminals (PR #224) 

296 

297 # ZWJ (U+200D) 

298 if ucs == 0x200D: 

299 if prev_was_virama: 

300 idx += 1 

301 elif idx + 1 < end: 

302 # Check for terminal grapheme override when base char is ExtPict/RI 

303 if (_grapheme_overrides 

304 and last_measured_idx >= 0 

305 and last_measured_ucs in _EMOJI_ZWJ_SET): 

306 cluster_end = _scan_zwj_cluster_end(pwcs, last_measured_idx, end) 

307 cluster = pwcs[last_measured_idx:cluster_end] 

308 override_w = _grapheme_overrides.get(cluster) 

309 if override_w is not None: 

310 total_width += (override_w - last_measured_w) 

311 last_measured_idx = -2 

312 last_measured_ucs = -1 

313 last_measured_w = 0 

314 prev_was_virama = False 

315 cluster_start = -1 

316 idx = cluster_end 

317 continue 

318 # No override; ZWJ breaks VS adjacency. 

319 # VS16 already set last_measured_idx = -2, blocking further VS16. 

320 last_measured_w = 0 

321 prev_was_virama = False 

322 idx += 2 

323 else: 

324 prev_was_virama = False 

325 idx += 1 

326 continue 

327 

328 # 6. VS16 (U+FE0F): converts preceding narrow character to wide. 

329 if ucs == 0xFE0F and last_measured_idx >= 0: 

330 if _vs16_narrower and _bisearch(last_measured_ucs, _vs16_narrower): 

331 pass 

332 elif _bisearch(last_measured_ucs, vs16_nw_table): 

333 cluster_width = 2 

334 last_measured_idx = -2 # prevent double application 

335 idx += 1 

336 continue 

337 

338 # VS15 (U+FE0E): text variation selector, requests narrow presentation. 

339 if ucs == 0xFE0E and last_measured_idx >= 0: 

340 base_ucs = last_measured_ucs 

341 vs15_narrow = bisearch(base_ucs, vs15_wn_table) 

342 if _vs15_wider and bisearch(base_ucs, _vs15_wider): 

343 vs15_narrow = False 

344 if vs15_narrow and last_measured_w == 2: 

345 total_width -= 1 

346 idx += 1 

347 continue 

348 

349 # 7. Regional Indicator & Fitzpatrick (both above BMP) 

350 if ucs > 0xFFFF: 

351 if ucs in _REGIONAL_INDICATOR_SET: 

352 ri_before = 0 

353 j = idx - 1 

354 while j >= 0 and ord(pwcs[j]) in _REGIONAL_INDICATOR_SET: 

355 ri_before += 1 

356 j -= 1 

357 if ri_before % 2 == 1: 

358 last_measured_ucs = ucs 

359 idx += 1 

360 continue 

361 elif (_FITZPATRICK_RANGE[0] <= ucs <= _FITZPATRICK_RANGE[1] 

362 and last_measured_ucs in _EMOJI_ZWJ_SET): 

363 idx += 1 

364 continue 

365 

366 # 8. Normal character: measure with wcwidth 

367 w = _wcwidth(char) 

368 if w < 0: 

369 # C0/C1 control character 

370 return -1 

371 # Apply single-codepoint terminal overrides (pre-merged tuples) 

372 if w == 2 and _narrower and bisearch(ucs, _narrower): 

373 w = 1 

374 elif w == 2 and _zeroer and bisearch(ucs, _zeroer): 

375 w = 0 

376 if w == 1 and _narrow_wider and bisearch(ucs, _narrow_wider): 

377 w = 2 

378 elif w == 1 and _narrow_zeroer and bisearch(ucs, _narrow_zeroer): 

379 w = 0 

380 if w > 0: 

381 # virama+consonant extends current cluster; otherwise start new 

382 if prev_was_virama: 

383 cluster_width = 2 

384 elif cluster_width: 

385 # flush previous cluster, check for grapheme overrides 

386 flushed = False 

387 if _grapheme_overrides and cluster_start >= 0: 

388 # Two-phase override lookup: candidate (cluster+current) catches Lo+Lo pairs 

389 # where both chars bear width (Thai KO KAI + SARA AM). cluster_text (cluster 

390 # alone) catches C+Mc clusters where the override key is shorter. 

391 candidate = pwcs[cluster_start:idx + 1] 

392 override_w = _grapheme_overrides.get(candidate) 

393 if override_w is not None: 

394 total_width = total_before_cluster + override_w 

395 flushed = True 

396 cluster_width = 0 

397 else: 

398 cluster_text = pwcs[cluster_start:idx] 

399 override_w = _grapheme_overrides.get(cluster_text) 

400 if override_w is not None: 

401 total_width = total_before_cluster + override_w 

402 else: 

403 total_width += cluster_width 

404 else: 

405 total_width += cluster_width 

406 if not flushed: 

407 cluster_width = w 

408 cluster_start = idx 

409 total_before_cluster = total_width 

410 else: 

411 cluster_width = w 

412 cluster_start = idx 

413 total_before_cluster = total_width 

414 last_measured_idx = idx 

415 last_measured_ucs = ucs 

416 last_measured_w = w 

417 prev_was_virama = False 

418 elif ucs in _ISC_VIRAMA_SET: 

419 prev_was_virama = True 

420 elif last_measured_idx >= 0 and _bisearch(ucs, _CATEGORY_MC_TABLE): 

421 # Spacing Combining Mark (Mc) following a base character 

422 cluster_width = 2 

423 last_measured_idx = -2 

424 prev_was_virama = False 

425 else: 

426 prev_was_virama = False 

427 idx += 1 

428 

429 if cluster_width: 

430 if _grapheme_overrides and cluster_start >= 0: 

431 cluster_text = pwcs[cluster_start:end] 

432 override_w = _grapheme_overrides.get(cluster_text) 

433 if override_w is not None: 

434 total_width = total_before_cluster + override_w 

435 else: 

436 total_width += cluster_width 

437 else: 

438 total_width += cluster_width 

439 return total_width