Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/grapheme.py: 23%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

214 statements  

1""" 

2Grapheme cluster segmentation following Unicode Standard Annex #29. 

3 

4This module provides pure-Python implementation of the grapheme cluster boundary algorithm as 

5defined in UAX #29: Unicode Text Segmentation. 

6 

7https://www.unicode.org/reports/tr29/ 

8""" 

9 

10from __future__ import annotations 

11 

12# std imports 

13import sys 

14import unicodedata 

15from enum import IntEnum 

16from functools import lru_cache 

17 

18from typing import TYPE_CHECKING, Optional, NamedTuple 

19 

20__lazy_modules__ = [ 

21 "wcwidth.bisearch", 

22 "wcwidth.table_grapheme", 

23] 

24# local 

25from .bisearch import bisearch as _bisearch 

26from .table_grapheme import (GRAPHEME_L, 

27 GRAPHEME_T, 

28 GRAPHEME_V, 

29 GRAPHEME_LV, 

30 INCB_EXTEND, 

31 INCB_LINKER, 

32 GRAPHEME_LVT, 

33 INCB_CONSONANT, 

34 GRAPHEME_EXTEND, 

35 GRAPHEME_CONTROL, 

36 GRAPHEME_PREPEND, 

37 GRAPHEME_SPACINGMARK, 

38 EXTENDED_PICTOGRAPHIC, 

39 GRAPHEME_REGIONAL_INDICATOR) 

40 

41if TYPE_CHECKING: # pragma: no cover 

42 # std imports 

43 from collections.abc import Iterator 

44 

45# check for python 3.15 for new iter_graphemes() function 

46_HAS_PYTHON315_ITER_GRAPHEMES = ( 

47 sys.version_info >= (3, 15) 

48 and hasattr(unicodedata, 'iter_graphemes') 

49) 

50 

51# Maximum backward scan distance when finding grapheme cluster boundaries. 

52# Covers all known Unicode grapheme clusters with margin; longer sequences are pathological. 

53MAX_GRAPHEME_SCAN = 32 

54 

55 

56class GCB(IntEnum): 

57 """Grapheme Cluster Break property values.""" 

58 

59 OTHER = 0 

60 CR = 1 

61 LF = 2 

62 CONTROL = 3 

63 EXTEND = 4 

64 ZWJ = 5 

65 REGIONAL_INDICATOR = 6 

66 PREPEND = 7 

67 SPACING_MARK = 8 

68 L = 9 

69 V = 10 

70 T = 11 

71 LV = 12 

72 LVT = 13 

73 

74 

75# All lru_cache sizes in this file use maxsize=1024, chosen by benchmarking UDHR data (500+ 

76# languages) and considering typical process-long sessions: western scripts need ~64 unique 

77# codepoints, but CJK could reach ~2000 -- but likely not. 

78@lru_cache(maxsize=1024) 

79def _grapheme_cluster_break(ucs: int) -> GCB: 

80 # pylint: disable=too-many-branches,too-complex 

81 """Return the Grapheme_Cluster_Break property for a codepoint.""" 

82 # Single codepoint matches 

83 if ucs == 0x000d: 

84 return GCB.CR 

85 if ucs == 0x000a: 

86 return GCB.LF 

87 if ucs == 0x200d: 

88 return GCB.ZWJ 

89 # Matching by codepoint ranges, requiring binary search 

90 if _bisearch(ucs, GRAPHEME_CONTROL): 

91 return GCB.CONTROL 

92 if _bisearch(ucs, GRAPHEME_EXTEND): 

93 return GCB.EXTEND 

94 if _bisearch(ucs, GRAPHEME_REGIONAL_INDICATOR): 

95 return GCB.REGIONAL_INDICATOR 

96 if _bisearch(ucs, GRAPHEME_PREPEND): 

97 return GCB.PREPEND 

98 if _bisearch(ucs, GRAPHEME_SPACINGMARK): 

99 return GCB.SPACING_MARK 

100 if _bisearch(ucs, GRAPHEME_L): 

101 return GCB.L 

102 if _bisearch(ucs, GRAPHEME_V): 

103 return GCB.V 

104 if _bisearch(ucs, GRAPHEME_T): 

105 return GCB.T 

106 if _bisearch(ucs, GRAPHEME_LV): 

107 return GCB.LV 

108 if _bisearch(ucs, GRAPHEME_LVT): 

109 return GCB.LVT 

110 return GCB.OTHER 

111 

112 

113@lru_cache(maxsize=1024) 

114def _is_extended_pictographic(ucs: int) -> bool: 

115 """Check if codepoint has Extended_Pictographic property.""" 

116 return bool(_bisearch(ucs, EXTENDED_PICTOGRAPHIC)) 

117 

118 

119@lru_cache(maxsize=1024) 

120def _is_incb_linker(ucs: int) -> bool: 

121 """Check if codepoint has InCB=Linker property.""" 

122 return bool(_bisearch(ucs, INCB_LINKER)) 

123 

124 

125@lru_cache(maxsize=1024) 

126def _is_incb_consonant(ucs: int) -> bool: 

127 """Check if codepoint has InCB=Consonant property.""" 

128 return bool(_bisearch(ucs, INCB_CONSONANT)) 

129 

130 

131@lru_cache(maxsize=1024) 

132def _is_incb_extend(ucs: int) -> bool: 

133 """Check if codepoint has InCB=Extend property.""" 

134 return bool(_bisearch(ucs, INCB_EXTEND)) 

135 

136 

137class BreakResult(NamedTuple): 

138 """Result of grapheme cluster break decision.""" 

139 

140 should_break: bool 

141 ri_count: int 

142 

143 

144@lru_cache(maxsize=1024) 

145def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> Optional[BreakResult]: 

146 """ 

147 Check simple GCB-pair-based break rules (cacheable). 

148 

149 Returns BreakResult for rules that can be determined from GCB properties alone, or None if 

150 complex lookback rules (GB9c, GB11) need to be checked. 

151 """ 

152 # GB3: CR x LF 

153 if prev_gcb == GCB.CR and curr_gcb == GCB.LF: 

154 return BreakResult(should_break=False, ri_count=0) 

155 

156 # GB4: (Control|CR|LF) ÷ 

157 if prev_gcb in (GCB.CONTROL, GCB.CR, GCB.LF): 

158 return BreakResult(should_break=True, ri_count=0) 

159 

160 # GB5: ÷ (Control|CR|LF) 

161 if curr_gcb in (GCB.CONTROL, GCB.CR, GCB.LF): 

162 return BreakResult(should_break=True, ri_count=0) 

163 

164 # GB6: L x (L|V|LV|LVT) 

165 if prev_gcb == GCB.L and curr_gcb in (GCB.L, GCB.V, GCB.LV, GCB.LVT): 

166 return BreakResult(should_break=False, ri_count=0) 

167 

168 # GB7: (LV|V) x (V|T) 

169 if prev_gcb in (GCB.LV, GCB.V) and curr_gcb in (GCB.V, GCB.T): 

170 return BreakResult(should_break=False, ri_count=0) 

171 

172 # GB8: (LVT|T) x T 

173 if prev_gcb in (GCB.LVT, GCB.T) and curr_gcb == GCB.T: 

174 return BreakResult(should_break=False, ri_count=0) 

175 

176 # GB9: x (Extend|ZWJ) - but ZWJ needs GB11 check, so only handle Extend here 

177 if curr_gcb == GCB.EXTEND: 

178 return BreakResult(should_break=False, ri_count=0) 

179 

180 # GB9a: x SpacingMark 

181 if curr_gcb == GCB.SPACING_MARK: 

182 return BreakResult(should_break=False, ri_count=0) 

183 

184 # GB9b: Prepend x 

185 if prev_gcb == GCB.PREPEND: 

186 return BreakResult(should_break=False, ri_count=0) 

187 

188 # GB9c and GB11 need lookback - return None to signal complex check needed 

189 # GB12/13 (RI pairs) need ri_count state - also handled in main function 

190 return None 

191 

192 

193def _should_break( 

194 prev_gcb: GCB, 

195 curr_gcb: GCB, 

196 text: str, 

197 curr_idx: int, 

198 ri_count: int, 

199) -> BreakResult: 

200 # pylint: disable=too-many-branches,too-complex 

201 """ 

202 Determine if there should be a grapheme cluster break between prev and curr. 

203 

204 Implements UAX #29 grapheme cluster boundary rules. 

205 """ 

206 # Try cached simple rules first 

207 result = _simple_break_check(prev_gcb, curr_gcb) 

208 if result is not None: 

209 return result 

210 

211 # GB9: x ZWJ (not cached because GB11 needs lookback when prev is ZWJ) 

212 if curr_gcb == GCB.ZWJ: 

213 return BreakResult(should_break=False, ri_count=0) 

214 

215 # GB9c: Indic conjunct cluster 

216 # \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]* \p{InCB=Linker} 

217 # [\p{InCB=Extend}\p{InCB=Linker}]* x \p{InCB=Consonant} 

218 curr_ucs = ord(text[curr_idx]) 

219 if _is_incb_consonant(curr_ucs): 

220 has_linker = False 

221 i = curr_idx - 1 

222 while i >= 0: 

223 prev_ucs = ord(text[i]) 

224 if _is_incb_linker(prev_ucs): 

225 has_linker = True 

226 i -= 1 

227 elif _is_incb_extend(prev_ucs): 

228 i -= 1 

229 elif _is_incb_consonant(prev_ucs): 

230 if has_linker: 

231 return BreakResult(should_break=False, ri_count=0) 

232 break 

233 else: 

234 break 

235 

236 # GB11: ExtPict Extend* ZWJ x ExtPict 

237 if prev_gcb == GCB.ZWJ and _is_extended_pictographic(curr_ucs): 

238 i = curr_idx - 2 # Skip the ZWJ at curr_idx - 1 

239 while i >= 0: 

240 prev_ucs = ord(text[i]) 

241 prev_prop = _grapheme_cluster_break(prev_ucs) 

242 if prev_prop == GCB.EXTEND: 

243 i -= 1 

244 elif _is_extended_pictographic(prev_ucs): 

245 return BreakResult(should_break=False, ri_count=0) 

246 else: 

247 break 

248 

249 # GB12/GB13: RI x RI (pair matching) 

250 if prev_gcb == GCB.REGIONAL_INDICATOR and curr_gcb == GCB.REGIONAL_INDICATOR: 

251 if ri_count % 2 == 1: 

252 return BreakResult(should_break=False, ri_count=ri_count + 1) 

253 return BreakResult(should_break=True, ri_count=1) 

254 

255 # GB999: Any ÷ Any 

256 ri_count = 1 if curr_gcb == GCB.REGIONAL_INDICATOR else 0 

257 return BreakResult(should_break=True, ri_count=ri_count) 

258 

259 

260def _iter_graphemes_stdlib( 

261 unistr: str, 

262 start: int = 0, 

263 end: Optional[int] = None, 

264) -> Iterator[str]: 

265 r""" 

266 Iterate over grapheme clusters using :func:`unicodedata.iter_graphemes`. 

267 

268 Grapheme clusters are "user-perceived characters" - what a user would 

269 consider a single character, which may consist of multiple Unicode 

270 codepoints (e.g., a base character with combining marks, emoji sequences). 

271 

272 :param unistr: The Unicode string to segment. 

273 :param start: Starting index (default 0). 

274 :param end: Ending index (default len(unistr)). 

275 :yields: Grapheme cluster substrings. 

276 

277 Example:: 

278 

279 >>> list(iter_graphemes('cafe\u0301')) 

280 ['c', 'a', 'f', 'e\u0301'] 

281 >>> list(iter_graphemes('ok\U0001F468\u200D\U0001F469\u200D\U0001F467')) 

282 ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467'] 

283 >>> list(iter_graphemes('ok\U0001F1FA\U0001F1F8')) 

284 ['o', 'k', '\U0001F1FA\U0001F1F8'] 

285 

286 .. versionadded:: 0.3.0 

287 """ 

288 if not unistr: 

289 return 

290 

291 length = len(unistr) 

292 

293 if end is None: 

294 end = length 

295 

296 if start >= end or start >= length: 

297 return 

298 

299 end = min(end, length) 

300 

301 full_segment = unistr[start:end] 

302 for seg in unicodedata.iter_graphemes(full_segment): # type: ignore[attr-defined] # pylint: disable=no-member 

303 yield full_segment[seg.start:seg.end] 

304 

305 

306def _iter_graphemes_python( 

307 unistr: str, 

308 start: int = 0, 

309 end: int | None = None, 

310) -> Iterator[str]: 

311 r""" 

312 Iterate over grapheme clusters using :func:`unicodedata.iter_graphemes`. 

313 

314 Grapheme clusters are "user-perceived characters" - what a user would 

315 consider a single character, which may consist of multiple Unicode 

316 codepoints (e.g., a base character with combining marks, emoji sequences). 

317 

318 :param unistr: The Unicode string to segment. 

319 :param start: Starting index (default 0). 

320 :param end: Ending index (default len(unistr)). 

321 :yields: Grapheme cluster substrings. 

322 

323 Example:: 

324 

325 >>> list(iter_graphemes('cafe\u0301')) 

326 ['c', 'a', 'f', 'e\u0301'] 

327 >>> list(iter_graphemes('ok\U0001F468\u200D\U0001F469\u200D\U0001F467')) 

328 ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467'] 

329 >>> list(iter_graphemes('ok\U0001F1FA\U0001F1F8')) 

330 ['o', 'k', '\U0001F1FA\U0001F1F8'] 

331 

332 .. versionadded:: 0.3.0 

333 """ 

334 if not unistr: 

335 return 

336 

337 length = len(unistr) 

338 

339 if end is None: 

340 end = length 

341 

342 if start >= end or start >= length: 

343 return 

344 

345 end = min(end, length) 

346 

347 # Track state for grapheme cluster boundaries 

348 cluster_start = start 

349 ri_count = 0 

350 

351 # Get GCB for first character 

352 prev_gcb = _grapheme_cluster_break(ord(unistr[start])) 

353 

354 # Handle Regional Indicator count initialization 

355 if prev_gcb == GCB.REGIONAL_INDICATOR: 

356 ri_count = 1 

357 

358 for idx in range(start + 1, end): 

359 curr_gcb = _grapheme_cluster_break(ord(unistr[idx])) 

360 

361 result = _should_break(prev_gcb, curr_gcb, unistr, idx, ri_count) 

362 ri_count = result.ri_count 

363 

364 if result.should_break: 

365 yield unistr[cluster_start:idx] 

366 cluster_start = idx 

367 

368 prev_gcb = curr_gcb 

369 

370 # Yield the final cluster 

371 yield unistr[cluster_start:end] 

372 

373 

374def _find_cluster_start(text: str, pos: int) -> int: 

375 """ 

376 Find the start of the grapheme cluster containing the character before pos. 

377 

378 Scans backwards from pos to find a safe starting point, then iterates forward using standard 

379 break rules to find the actual cluster boundary. 

380 

381 :param text: The Unicode string. 

382 :param pos: Position to search before (exclusive). 

383 :returns: Start position of the grapheme cluster. 

384 """ 

385 target_cp = ord(text[pos - 1]) 

386 

387 # GB3: CR x LF - LF after CR is part of same cluster 

388 if target_cp == 0x0A and pos >= 2 and text[pos - 2] == '\r': 

389 return pos - 2 

390 

391 # Fast path: ASCII (except LF) starts its own cluster 

392 if target_cp < 0x80: 

393 # GB9b: Check for preceding PREPEND (rare: Arabic/Brahmic) 

394 if pos >= 2 and target_cp >= 0x20: 

395 prev_cp = ord(text[pos - 2]) 

396 if prev_cp >= 0x80 and _grapheme_cluster_break(prev_cp) == GCB.PREPEND: 

397 return _find_cluster_start(text, pos - 1) 

398 return pos - 1 

399 

400 # Scan backward to find a safe starting point 

401 safe_start = pos - 1 

402 while safe_start > 0 and (pos - safe_start) < MAX_GRAPHEME_SCAN: 

403 cp = ord(text[safe_start]) 

404 if 0x20 <= cp < 0x80: # ASCII always starts a cluster 

405 break 

406 if _grapheme_cluster_break(cp) == GCB.CONTROL: # GB4 

407 break 

408 safe_start -= 1 

409 

410 # Verify forward to find the actual cluster boundary 

411 cluster_start = safe_start 

412 left_gcb = _grapheme_cluster_break(ord(text[safe_start])) 

413 ri_count = 1 if left_gcb == GCB.REGIONAL_INDICATOR else 0 

414 

415 for i in range(safe_start + 1, pos): 

416 right_gcb = _grapheme_cluster_break(ord(text[i])) 

417 result = _should_break(left_gcb, right_gcb, text, i, ri_count) 

418 ri_count = result.ri_count 

419 if result.should_break: 

420 cluster_start = i 

421 left_gcb = right_gcb 

422 

423 return cluster_start 

424 

425 

426def grapheme_boundary_before(unistr: str, pos: int) -> int: 

427 r""" 

428 Find the grapheme cluster boundary immediately before a position. 

429 

430 :param unistr: The Unicode string to search. 

431 :param pos: Position in the string (0 < pos <= len(unistr)). 

432 :returns: Start index of the grapheme cluster containing the character at pos-1. 

433 

434 Example:: 

435 

436 >>> grapheme_boundary_before('Hello \U0001F44B\U0001F3FB', 8) 

437 6 

438 >>> grapheme_boundary_before('a\r\nb', 3) 

439 1 

440 

441 .. versionadded:: 0.3.6 

442 """ 

443 if pos <= 0: 

444 return 0 

445 return _find_cluster_start(unistr, min(pos, len(unistr))) 

446 

447 

448def iter_graphemes_reverse( 

449 unistr: str, 

450 start: int = 0, 

451 end: Optional[int] = None, 

452) -> Iterator[str]: 

453 r""" 

454 Iterate over grapheme clusters in reverse order (last to first). 

455 

456 :param unistr: The Unicode string to segment. 

457 :param start: Starting index (default 0). 

458 :param end: Ending index (default len(unistr)). 

459 :yields: Grapheme cluster substrings in reverse order. 

460 

461 Example:: 

462 

463 >>> list(iter_graphemes_reverse('cafe\u0301')) 

464 ['e\u0301', 'f', 'a', 'c'] 

465 

466 .. versionadded:: 0.3.6 

467 """ 

468 if not unistr: 

469 return 

470 

471 length = len(unistr) 

472 

473 end = length if end is None else min(end, length) 

474 start = max(start, 0) 

475 

476 if start >= end or start >= length: 

477 return 

478 

479 pos = end 

480 while pos > start: 

481 cluster_start = _find_cluster_start(unistr, pos) 

482 # Don't yield partial graphemes that extend before start 

483 if cluster_start < start: 

484 break 

485 yield unistr[cluster_start:pos] 

486 pos = cluster_start 

487 

488 

489# Bind iter_graphemes at module level to avoid per-call dispatch overhead. 

490iter_graphemes = ( 

491 _iter_graphemes_stdlib if _HAS_PYTHON315_ITER_GRAPHEMES 

492 else _iter_graphemes_python 

493)