Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/grapheme.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

197 statements  

1""" 

2Grapheme cluster segmentation following Unicode Standard Annex #29. 

3 

4This module provides pure-Python implementation of the grapheme cluster boundary algorithm as 

5defined in UAX #29: Unicode Text Segmentation. 

6 

7https://www.unicode.org/reports/tr29/ 

8""" 

9 

10from __future__ import annotations 

11 

12# std imports 

13from enum import IntEnum 

14from functools import lru_cache 

15 

16from typing import TYPE_CHECKING, NamedTuple 

17 

18# local 

19from .bisearch import bisearch as _bisearch 

20from .table_grapheme import (GRAPHEME_L, 

21 GRAPHEME_T, 

22 GRAPHEME_V, 

23 GRAPHEME_LV, 

24 INCB_EXTEND, 

25 INCB_LINKER, 

26 GRAPHEME_LVT, 

27 INCB_CONSONANT, 

28 GRAPHEME_EXTEND, 

29 GRAPHEME_CONTROL, 

30 GRAPHEME_PREPEND, 

31 GRAPHEME_SPACINGMARK, 

32 EXTENDED_PICTOGRAPHIC, 

33 GRAPHEME_REGIONAL_INDICATOR) 

34 

35if TYPE_CHECKING: # pragma: no cover 

36 # std imports 

37 from collections.abc import Iterator 

38 

39# Maximum backward scan distance when finding grapheme cluster boundaries. 

40# Covers all known Unicode grapheme clusters with margin; longer sequences are pathological. 

41MAX_GRAPHEME_SCAN = 32 

42 

43 

44class GCB(IntEnum): 

45 """Grapheme Cluster Break property values.""" 

46 

47 OTHER = 0 

48 CR = 1 

49 LF = 2 

50 CONTROL = 3 

51 EXTEND = 4 

52 ZWJ = 5 

53 REGIONAL_INDICATOR = 6 

54 PREPEND = 7 

55 SPACING_MARK = 8 

56 L = 9 

57 V = 10 

58 T = 11 

59 LV = 12 

60 LVT = 13 

61 

62 

63# All lru_cache sizes in this file use maxsize=1024, chosen by benchmarking UDHR data (500+ 

64# languages) and considering typical process-long sessions: western scripts need ~64 unique 

65# codepoints, but CJK could reach ~2000 -- but likely not. 

66@lru_cache(maxsize=1024) 

67def _grapheme_cluster_break(ucs: int) -> GCB: 

68 # pylint: disable=too-many-branches,too-complex 

69 """Return the Grapheme_Cluster_Break property for a codepoint.""" 

70 # Single codepoint matches 

71 if ucs == 0x000d: 

72 return GCB.CR 

73 if ucs == 0x000a: 

74 return GCB.LF 

75 if ucs == 0x200d: 

76 return GCB.ZWJ 

77 # Matching by codepoint ranges, requiring binary search 

78 if _bisearch(ucs, GRAPHEME_CONTROL): 

79 return GCB.CONTROL 

80 if _bisearch(ucs, GRAPHEME_EXTEND): 

81 return GCB.EXTEND 

82 if _bisearch(ucs, GRAPHEME_REGIONAL_INDICATOR): 

83 return GCB.REGIONAL_INDICATOR 

84 if _bisearch(ucs, GRAPHEME_PREPEND): 

85 return GCB.PREPEND 

86 if _bisearch(ucs, GRAPHEME_SPACINGMARK): 

87 return GCB.SPACING_MARK 

88 if _bisearch(ucs, GRAPHEME_L): 

89 return GCB.L 

90 if _bisearch(ucs, GRAPHEME_V): 

91 return GCB.V 

92 if _bisearch(ucs, GRAPHEME_T): 

93 return GCB.T 

94 if _bisearch(ucs, GRAPHEME_LV): 

95 return GCB.LV 

96 if _bisearch(ucs, GRAPHEME_LVT): 

97 return GCB.LVT 

98 return GCB.OTHER 

99 

100 

101@lru_cache(maxsize=1024) 

102def _is_extended_pictographic(ucs: int) -> bool: 

103 """Check if codepoint has Extended_Pictographic property.""" 

104 return bool(_bisearch(ucs, EXTENDED_PICTOGRAPHIC)) 

105 

106 

107@lru_cache(maxsize=1024) 

108def _is_incb_linker(ucs: int) -> bool: 

109 """Check if codepoint has InCB=Linker property.""" 

110 return bool(_bisearch(ucs, INCB_LINKER)) 

111 

112 

113@lru_cache(maxsize=1024) 

114def _is_incb_consonant(ucs: int) -> bool: 

115 """Check if codepoint has InCB=Consonant property.""" 

116 return bool(_bisearch(ucs, INCB_CONSONANT)) 

117 

118 

119@lru_cache(maxsize=1024) 

120def _is_incb_extend(ucs: int) -> bool: 

121 """Check if codepoint has InCB=Extend property.""" 

122 return bool(_bisearch(ucs, INCB_EXTEND)) 

123 

124 

125class BreakResult(NamedTuple): 

126 """Result of grapheme cluster break decision.""" 

127 

128 should_break: bool 

129 ri_count: int 

130 

131 

132@lru_cache(maxsize=1024) 

133def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> BreakResult | None: 

134 """ 

135 Check simple GCB-pair-based break rules (cacheable). 

136 

137 Returns BreakResult for rules that can be determined from GCB properties alone, or None if 

138 complex lookback rules (GB9c, GB11) need to be checked. 

139 """ 

140 # GB3: CR x LF 

141 if prev_gcb == GCB.CR and curr_gcb == GCB.LF: 

142 return BreakResult(should_break=False, ri_count=0) 

143 

144 # GB4: (Control|CR|LF) ÷ 

145 if prev_gcb in (GCB.CONTROL, GCB.CR, GCB.LF): 

146 return BreakResult(should_break=True, ri_count=0) 

147 

148 # GB5: ÷ (Control|CR|LF) 

149 if curr_gcb in (GCB.CONTROL, GCB.CR, GCB.LF): 

150 return BreakResult(should_break=True, ri_count=0) 

151 

152 # GB6: L x (L|V|LV|LVT) 

153 if prev_gcb == GCB.L and curr_gcb in (GCB.L, GCB.V, GCB.LV, GCB.LVT): 

154 return BreakResult(should_break=False, ri_count=0) 

155 

156 # GB7: (LV|V) x (V|T) 

157 if prev_gcb in (GCB.LV, GCB.V) and curr_gcb in (GCB.V, GCB.T): 

158 return BreakResult(should_break=False, ri_count=0) 

159 

160 # GB8: (LVT|T) x T 

161 if prev_gcb in (GCB.LVT, GCB.T) and curr_gcb == GCB.T: 

162 return BreakResult(should_break=False, ri_count=0) 

163 

164 # GB9: x (Extend|ZWJ) - but ZWJ needs GB11 check, so only handle Extend here 

165 if curr_gcb == GCB.EXTEND: 

166 return BreakResult(should_break=False, ri_count=0) 

167 

168 # GB9a: x SpacingMark 

169 if curr_gcb == GCB.SPACING_MARK: 

170 return BreakResult(should_break=False, ri_count=0) 

171 

172 # GB9b: Prepend x 

173 if prev_gcb == GCB.PREPEND: 

174 return BreakResult(should_break=False, ri_count=0) 

175 

176 # GB9c and GB11 need lookback - return None to signal complex check needed 

177 # GB12/13 (RI pairs) need ri_count state - also handled in main function 

178 return None 

179 

180 

181def _should_break( 

182 prev_gcb: GCB, 

183 curr_gcb: GCB, 

184 text: str, 

185 curr_idx: int, 

186 ri_count: int, 

187) -> BreakResult: 

188 # pylint: disable=too-many-branches,too-complex 

189 """ 

190 Determine if there should be a grapheme cluster break between prev and curr. 

191 

192 Implements UAX #29 grapheme cluster boundary rules. 

193 """ 

194 # Try cached simple rules first 

195 result = _simple_break_check(prev_gcb, curr_gcb) 

196 if result is not None: 

197 return result 

198 

199 # GB9: x ZWJ (not cached because GB11 needs lookback when prev is ZWJ) 

200 if curr_gcb == GCB.ZWJ: 

201 return BreakResult(should_break=False, ri_count=0) 

202 

203 # GB9c: Indic conjunct cluster 

204 # \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]* \p{InCB=Linker} 

205 # [\p{InCB=Extend}\p{InCB=Linker}]* x \p{InCB=Consonant} 

206 curr_ucs = ord(text[curr_idx]) 

207 if _is_incb_consonant(curr_ucs): 

208 has_linker = False 

209 i = curr_idx - 1 

210 while i >= 0: 

211 prev_ucs = ord(text[i]) 

212 if _is_incb_linker(prev_ucs): 

213 has_linker = True 

214 i -= 1 

215 elif _is_incb_extend(prev_ucs): 

216 i -= 1 

217 elif _is_incb_consonant(prev_ucs): 

218 if has_linker: 

219 return BreakResult(should_break=False, ri_count=0) 

220 break 

221 else: 

222 break 

223 

224 # GB11: ExtPict Extend* ZWJ x ExtPict 

225 if prev_gcb == GCB.ZWJ and _is_extended_pictographic(curr_ucs): 

226 i = curr_idx - 2 # Skip the ZWJ at curr_idx - 1 

227 while i >= 0: 

228 prev_ucs = ord(text[i]) 

229 prev_prop = _grapheme_cluster_break(prev_ucs) 

230 if prev_prop == GCB.EXTEND: 

231 i -= 1 

232 elif _is_extended_pictographic(prev_ucs): 

233 return BreakResult(should_break=False, ri_count=0) 

234 else: 

235 break 

236 

237 # GB12/GB13: RI x RI (pair matching) 

238 if prev_gcb == GCB.REGIONAL_INDICATOR and curr_gcb == GCB.REGIONAL_INDICATOR: 

239 if ri_count % 2 == 1: 

240 return BreakResult(should_break=False, ri_count=ri_count + 1) 

241 return BreakResult(should_break=True, ri_count=1) 

242 

243 # GB999: Any ÷ Any 

244 ri_count = 1 if curr_gcb == GCB.REGIONAL_INDICATOR else 0 

245 return BreakResult(should_break=True, ri_count=ri_count) 

246 

247 

248def iter_graphemes( 

249 unistr: str, 

250 start: int = 0, 

251 end: int | None = None, 

252) -> Iterator[str]: 

253 r""" 

254 Iterate over grapheme clusters in a Unicode string. 

255 

256 Grapheme clusters are "user-perceived characters" - what a user would 

257 consider a single character, which may consist of multiple Unicode 

258 codepoints (e.g., a base character with combining marks, emoji sequences). 

259 

260 :param unistr: The Unicode string to segment. 

261 :param start: Starting index (default 0). 

262 :param end: Ending index (default len(unistr)). 

263 :yields: Grapheme cluster substrings. 

264 

265 Example:: 

266 

267 >>> list(iter_graphemes('cafe\u0301')) 

268 ['c', 'a', 'f', 'e\u0301'] 

269 >>> list(iter_graphemes('\U0001F468\u200D\U0001F469\u200D\U0001F467')) 

270 ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467'] 

271 >>> list(iter_graphemes('\U0001F1FA\U0001F1F8')) 

272 ['o', 'k', '\U0001F1FA\U0001F1F8'] 

273 

274 .. versionadded:: 0.3.0 

275 """ 

276 if not unistr: 

277 return 

278 

279 length = len(unistr) 

280 

281 if end is None: 

282 end = length 

283 

284 if start >= end or start >= length: 

285 return 

286 

287 end = min(end, length) 

288 

289 # Track state for grapheme cluster boundaries 

290 cluster_start = start 

291 ri_count = 0 

292 

293 # Get GCB for first character 

294 prev_gcb = _grapheme_cluster_break(ord(unistr[start])) 

295 

296 # Handle Regional Indicator count initialization 

297 if prev_gcb == GCB.REGIONAL_INDICATOR: 

298 ri_count = 1 

299 

300 for idx in range(start + 1, end): 

301 curr_gcb = _grapheme_cluster_break(ord(unistr[idx])) 

302 

303 result = _should_break(prev_gcb, curr_gcb, unistr, idx, ri_count) 

304 ri_count = result.ri_count 

305 

306 if result.should_break: 

307 yield unistr[cluster_start:idx] 

308 cluster_start = idx 

309 

310 prev_gcb = curr_gcb 

311 

312 # Yield the final cluster 

313 yield unistr[cluster_start:end] 

314 

315 

316def _find_cluster_start(text: str, pos: int) -> int: 

317 """ 

318 Find the start of the grapheme cluster containing the character before pos. 

319 

320 Scans backwards from pos to find a safe starting point, then iterates forward using standard 

321 break rules to find the actual cluster boundary. 

322 

323 :param text: The Unicode string. 

324 :param pos: Position to search before (exclusive). 

325 :returns: Start position of the grapheme cluster. 

326 """ 

327 target_cp = ord(text[pos - 1]) 

328 

329 # GB3: CR x LF - LF after CR is part of same cluster 

330 if target_cp == 0x0A and pos >= 2 and text[pos - 2] == '\r': 

331 return pos - 2 

332 

333 # Fast path: ASCII (except LF) starts its own cluster 

334 if target_cp < 0x80: 

335 # GB9b: Check for preceding PREPEND (rare: Arabic/Brahmic) 

336 if pos >= 2 and target_cp >= 0x20: 

337 prev_cp = ord(text[pos - 2]) 

338 if prev_cp >= 0x80 and _grapheme_cluster_break(prev_cp) == GCB.PREPEND: 

339 return _find_cluster_start(text, pos - 1) 

340 return pos - 1 

341 

342 # Scan backward to find a safe starting point 

343 safe_start = pos - 1 

344 while safe_start > 0 and (pos - safe_start) < MAX_GRAPHEME_SCAN: 

345 cp = ord(text[safe_start]) 

346 if 0x20 <= cp < 0x80: # ASCII always starts a cluster 

347 break 

348 if _grapheme_cluster_break(cp) == GCB.CONTROL: # GB4 

349 break 

350 safe_start -= 1 

351 

352 # Verify forward to find the actual cluster boundary 

353 cluster_start = safe_start 

354 left_gcb = _grapheme_cluster_break(ord(text[safe_start])) 

355 ri_count = 1 if left_gcb == GCB.REGIONAL_INDICATOR else 0 

356 

357 for i in range(safe_start + 1, pos): 

358 right_gcb = _grapheme_cluster_break(ord(text[i])) 

359 result = _should_break(left_gcb, right_gcb, text, i, ri_count) 

360 ri_count = result.ri_count 

361 if result.should_break: 

362 cluster_start = i 

363 left_gcb = right_gcb 

364 

365 return cluster_start 

366 

367 

368def grapheme_boundary_before(unistr: str, pos: int) -> int: 

369 r""" 

370 Find the grapheme cluster boundary immediately before a position. 

371 

372 :param unistr: The Unicode string to search. 

373 :param pos: Position in the string (0 < pos <= len(unistr)). 

374 :returns: Start index of the grapheme cluster containing the character at pos-1. 

375 

376 Example:: 

377 

378 >>> grapheme_boundary_before('Hello \U0001F44B\U0001F3FB', 8) 

379 6 

380 >>> grapheme_boundary_before('a\r\nb', 3) 

381 1 

382 

383 .. versionadded:: 0.3.6 

384 """ 

385 if pos <= 0: 

386 return 0 

387 return _find_cluster_start(unistr, min(pos, len(unistr))) 

388 

389 

390def iter_graphemes_reverse( 

391 unistr: str, 

392 start: int = 0, 

393 end: int | None = None, 

394) -> Iterator[str]: 

395 r""" 

396 Iterate over grapheme clusters in reverse order (last to first). 

397 

398 :param unistr: The Unicode string to segment. 

399 :param start: Starting index (default 0). 

400 :param end: Ending index (default len(unistr)). 

401 :yields: Grapheme cluster substrings in reverse order. 

402 

403 Example:: 

404 

405 >>> list(iter_graphemes_reverse('cafe\u0301')) 

406 ['e\u0301', 'f', 'a', 'c'] 

407 

408 .. versionadded:: 0.3.6 

409 """ 

410 if not unistr: 

411 return 

412 

413 length = len(unistr) 

414 

415 end = length if end is None else min(end, length) 

416 start = max(start, 0) 

417 

418 if start >= end or start >= length: 

419 return 

420 

421 pos = end 

422 while pos > start: 

423 cluster_start = _find_cluster_start(unistr, pos) 

424 # Don't yield partial graphemes that extend before start 

425 if cluster_start < start: 

426 break 

427 yield unistr[cluster_start:pos] 

428 pos = cluster_start