Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/grapheme.py: 23%

1"""

2Grapheme cluster segmentation following Unicode Standard Annex #29.

4This module provides pure-Python implementation of the grapheme cluster boundary algorithm as

5defined in UAX #29: Unicode Text Segmentation.

7https://www.unicode.org/reports/tr29/

8"""

10from __future__ import annotations

12# std imports

13import sys

14import unicodedata

15from enum import IntEnum

16from functools import lru_cache

18from typing import TYPE_CHECKING, Optional, NamedTuple

20__lazy_modules__ = [

21 "wcwidth.bisearch",

22 "wcwidth.table_grapheme",

23]

24# local

25from .bisearch import bisearch as _bisearch

26from .table_grapheme import (GRAPHEME_L,

27 GRAPHEME_T,

28 GRAPHEME_V,

29 GRAPHEME_LV,

30 INCB_EXTEND,

31 INCB_LINKER,

32 GRAPHEME_LVT,

33 INCB_CONSONANT,

34 GRAPHEME_EXTEND,

35 GRAPHEME_CONTROL,

36 GRAPHEME_PREPEND,

37 GRAPHEME_SPACINGMARK,

38 EXTENDED_PICTOGRAPHIC,

39 GRAPHEME_REGIONAL_INDICATOR)

41if TYPE_CHECKING: # pragma: no cover

42 # std imports

43 from collections.abc import Iterator

45# check for python 3.15 for new iter_graphemes() function

46_HAS_PYTHON315_ITER_GRAPHEMES = (

47 sys.version_info >= (3, 15)

48 and hasattr(unicodedata, 'iter_graphemes')

49)

51# Maximum backward scan distance when finding grapheme cluster boundaries.

52# Covers all known Unicode grapheme clusters with margin; longer sequences are pathological.

53MAX_GRAPHEME_SCAN = 32

56class GCB(IntEnum):

57 """Grapheme Cluster Break property values."""

59 OTHER = 0

60 CR = 1

61 LF = 2

62 CONTROL = 3

63 EXTEND = 4

64 ZWJ = 5

65 REGIONAL_INDICATOR = 6

66 PREPEND = 7

67 SPACING_MARK = 8

68 L = 9

69 V = 10

70 T = 11

71 LV = 12

72 LVT = 13

75# All lru_cache sizes in this file use maxsize=1024, chosen by benchmarking UDHR data (500+

76# languages) and considering typical process-long sessions: western scripts need ~64 unique

77# codepoints, but CJK could reach ~2000 -- but likely not.

78@lru_cache(maxsize=1024)

79def _grapheme_cluster_break(ucs: int) -> GCB:

80 # pylint: disable=too-many-branches,too-complex

81 """Return the Grapheme_Cluster_Break property for a codepoint."""

82 # Single codepoint matches

83 if ucs == 0x000d:

84 return GCB.CR

85 if ucs == 0x000a:

86 return GCB.LF

87 if ucs == 0x200d:

88 return GCB.ZWJ

89 # Matching by codepoint ranges, requiring binary search

90 if _bisearch(ucs, GRAPHEME_CONTROL):

91 return GCB.CONTROL

92 if _bisearch(ucs, GRAPHEME_EXTEND):

93 return GCB.EXTEND

94 if _bisearch(ucs, GRAPHEME_REGIONAL_INDICATOR):

95 return GCB.REGIONAL_INDICATOR

96 if _bisearch(ucs, GRAPHEME_PREPEND):

97 return GCB.PREPEND

98 if _bisearch(ucs, GRAPHEME_SPACINGMARK):

99 return GCB.SPACING_MARK

100 if _bisearch(ucs, GRAPHEME_L):

101 return GCB.L

102 if _bisearch(ucs, GRAPHEME_V):

103 return GCB.V

104 if _bisearch(ucs, GRAPHEME_T):

105 return GCB.T

106 if _bisearch(ucs, GRAPHEME_LV):

107 return GCB.LV

108 if _bisearch(ucs, GRAPHEME_LVT):

109 return GCB.LVT

110 return GCB.OTHER

111

112

113@lru_cache(maxsize=1024)

114def _is_extended_pictographic(ucs: int) -> bool:

115 """Check if codepoint has Extended_Pictographic property."""

116 return bool(_bisearch(ucs, EXTENDED_PICTOGRAPHIC))

117

118

119@lru_cache(maxsize=1024)

120def _is_incb_linker(ucs: int) -> bool:

121 """Check if codepoint has InCB=Linker property."""

122 return bool(_bisearch(ucs, INCB_LINKER))

123

124

125@lru_cache(maxsize=1024)

126def _is_incb_consonant(ucs: int) -> bool:

127 """Check if codepoint has InCB=Consonant property."""

128 return bool(_bisearch(ucs, INCB_CONSONANT))

129

130

131@lru_cache(maxsize=1024)

132def _is_incb_extend(ucs: int) -> bool:

133 """Check if codepoint has InCB=Extend property."""

134 return bool(_bisearch(ucs, INCB_EXTEND))

135

136

137class BreakResult(NamedTuple):

138 """Result of grapheme cluster break decision."""

139

140 should_break: bool

141 ri_count: int

142

143

144@lru_cache(maxsize=1024)

145def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> Optional[BreakResult]:

146 """

147 Check simple GCB-pair-based break rules (cacheable).

148

149 Returns BreakResult for rules that can be determined from GCB properties alone, or None if

150 complex lookback rules (GB9c, GB11) need to be checked.

151 """

152 # GB3: CR x LF

153 if prev_gcb == GCB.CR and curr_gcb == GCB.LF:

154 return BreakResult(should_break=False, ri_count=0)

155

156 # GB4: (Control|CR|LF) ÷

157 if prev_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):

158 return BreakResult(should_break=True, ri_count=0)

159

160 # GB5: ÷ (Control|CR|LF)

161 if curr_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):

162 return BreakResult(should_break=True, ri_count=0)

163

164 # GB6: L x (L|V|LV|LVT)

165 if prev_gcb == GCB.L and curr_gcb in (GCB.L, GCB.V, GCB.LV, GCB.LVT):

166 return BreakResult(should_break=False, ri_count=0)

167

168 # GB7: (LV|V) x (V|T)

169 if prev_gcb in (GCB.LV, GCB.V) and curr_gcb in (GCB.V, GCB.T):

170 return BreakResult(should_break=False, ri_count=0)

171

172 # GB8: (LVT|T) x T

173 if prev_gcb in (GCB.LVT, GCB.T) and curr_gcb == GCB.T:

174 return BreakResult(should_break=False, ri_count=0)

175

176 # GB9: x (Extend|ZWJ) - but ZWJ needs GB11 check, so only handle Extend here

177 if curr_gcb == GCB.EXTEND:

178 return BreakResult(should_break=False, ri_count=0)

179

180 # GB9a: x SpacingMark

181 if curr_gcb == GCB.SPACING_MARK:

182 return BreakResult(should_break=False, ri_count=0)

183

184 # GB9b: Prepend x

185 if prev_gcb == GCB.PREPEND:

186 return BreakResult(should_break=False, ri_count=0)

187

188 # GB9c and GB11 need lookback - return None to signal complex check needed

189 # GB12/13 (RI pairs) need ri_count state - also handled in main function

190 return None

191

192

193def _should_break(

194 prev_gcb: GCB,

195 curr_gcb: GCB,

196 text: str,

197 curr_idx: int,

198 ri_count: int,

199) -> BreakResult:

200 # pylint: disable=too-many-branches,too-complex

201 """

202 Determine if there should be a grapheme cluster break between prev and curr.

203

204 Implements UAX #29 grapheme cluster boundary rules.

205 """

206 # Try cached simple rules first

207 result = _simple_break_check(prev_gcb, curr_gcb)

208 if result is not None:

209 return result

210

211 # GB9: x ZWJ (not cached because GB11 needs lookback when prev is ZWJ)

212 if curr_gcb == GCB.ZWJ:

213 return BreakResult(should_break=False, ri_count=0)

214

215 # GB9c: Indic conjunct cluster

216 # \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]* \p{InCB=Linker}

217 # [\p{InCB=Extend}\p{InCB=Linker}]* x \p{InCB=Consonant}

218 curr_ucs = ord(text[curr_idx])

219 if _is_incb_consonant(curr_ucs):

220 has_linker = False

221 i = curr_idx - 1

222 while i >= 0:

223 prev_ucs = ord(text[i])

224 if _is_incb_linker(prev_ucs):

225 has_linker = True

226 i -= 1

227 elif _is_incb_extend(prev_ucs):

228 i -= 1

229 elif _is_incb_consonant(prev_ucs):

230 if has_linker:

231 return BreakResult(should_break=False, ri_count=0)

232 break

233 else:

234 break

235

236 # GB11: ExtPict Extend* ZWJ x ExtPict

237 if prev_gcb == GCB.ZWJ and _is_extended_pictographic(curr_ucs):

238 i = curr_idx - 2 # Skip the ZWJ at curr_idx - 1

239 while i >= 0:

240 prev_ucs = ord(text[i])

241 prev_prop = _grapheme_cluster_break(prev_ucs)

242 if prev_prop == GCB.EXTEND:

243 i -= 1

244 elif _is_extended_pictographic(prev_ucs):

245 return BreakResult(should_break=False, ri_count=0)

246 else:

247 break

248

249 # GB12/GB13: RI x RI (pair matching)

250 if prev_gcb == GCB.REGIONAL_INDICATOR and curr_gcb == GCB.REGIONAL_INDICATOR:

251 if ri_count % 2 == 1:

252 return BreakResult(should_break=False, ri_count=ri_count + 1)

253 return BreakResult(should_break=True, ri_count=1)

254

255 # GB999: Any ÷ Any

256 ri_count = 1 if curr_gcb == GCB.REGIONAL_INDICATOR else 0

257 return BreakResult(should_break=True, ri_count=ri_count)

258

259

260def _iter_graphemes_stdlib(

261 unistr: str,

262 start: int = 0,

263 end: Optional[int] = None,

264) -> Iterator[str]:

265 r"""

266 Iterate over grapheme clusters using :func:`unicodedata.iter_graphemes`.

267

268 Grapheme clusters are "user-perceived characters" - what a user would

269 consider a single character, which may consist of multiple Unicode

270 codepoints (e.g., a base character with combining marks, emoji sequences).

271

272 :param unistr: The Unicode string to segment.

273 :param start: Starting index (default 0).

274 :param end: Ending index (default len(unistr)).

275 :yields: Grapheme cluster substrings.

276

277 Example::

278

279 >>> list(iter_graphemes('cafe\u0301'))

280 ['c', 'a', 'f', 'e\u0301']

281 >>> list(iter_graphemes('ok\U0001F468\u200D\U0001F469\u200D\U0001F467'))

282 ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467']

283 >>> list(iter_graphemes('ok\U0001F1FA\U0001F1F8'))

284 ['o', 'k', '\U0001F1FA\U0001F1F8']

285

286 .. versionadded:: 0.3.0

287 """

288 if not unistr:

289 return

290

291 length = len(unistr)

292

293 if end is None:

294 end = length

295

296 if start >= end or start >= length:

297 return

298

299 end = min(end, length)

300

301 full_segment = unistr[start:end]

302 for seg in unicodedata.iter_graphemes(full_segment): # type: ignore[attr-defined] # pylint: disable=no-member

303 yield full_segment[seg.start:seg.end]

304

305

306def _iter_graphemes_python(

307 unistr: str,

308 start: int = 0,

309 end: int | None = None,

310) -> Iterator[str]:

311 r"""

312 Iterate over grapheme clusters using :func:`unicodedata.iter_graphemes`.

313

314 Grapheme clusters are "user-perceived characters" - what a user would

315 consider a single character, which may consist of multiple Unicode

316 codepoints (e.g., a base character with combining marks, emoji sequences).

317

318 :param unistr: The Unicode string to segment.

319 :param start: Starting index (default 0).

320 :param end: Ending index (default len(unistr)).

321 :yields: Grapheme cluster substrings.

322

323 Example::

324

325 >>> list(iter_graphemes('cafe\u0301'))

326 ['c', 'a', 'f', 'e\u0301']

327 >>> list(iter_graphemes('ok\U0001F468\u200D\U0001F469\u200D\U0001F467'))

328 ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467']

329 >>> list(iter_graphemes('ok\U0001F1FA\U0001F1F8'))

330 ['o', 'k', '\U0001F1FA\U0001F1F8']

331

332 .. versionadded:: 0.3.0

333 """

334 if not unistr:

335 return

336

337 length = len(unistr)

338

339 if end is None:

340 end = length

341

342 if start >= end or start >= length:

343 return

344

345 end = min(end, length)

346

347 # Track state for grapheme cluster boundaries

348 cluster_start = start

349 ri_count = 0

350

351 # Get GCB for first character

352 prev_gcb = _grapheme_cluster_break(ord(unistr[start]))

353

354 # Handle Regional Indicator count initialization

355 if prev_gcb == GCB.REGIONAL_INDICATOR:

356 ri_count = 1

357

358 for idx in range(start + 1, end):

359 curr_gcb = _grapheme_cluster_break(ord(unistr[idx]))

360

361 result = _should_break(prev_gcb, curr_gcb, unistr, idx, ri_count)

362 ri_count = result.ri_count

363

364 if result.should_break:

365 yield unistr[cluster_start:idx]

366 cluster_start = idx

367

368 prev_gcb = curr_gcb

369

370 # Yield the final cluster

371 yield unistr[cluster_start:end]

372

373

374def _find_cluster_start(text: str, pos: int) -> int:

375 """

376 Find the start of the grapheme cluster containing the character before pos.

377

378 Scans backwards from pos to find a safe starting point, then iterates forward using standard

379 break rules to find the actual cluster boundary.

380

381 :param text: The Unicode string.

382 :param pos: Position to search before (exclusive).

383 :returns: Start position of the grapheme cluster.

384 """

385 target_cp = ord(text[pos - 1])

386

387 # GB3: CR x LF - LF after CR is part of same cluster

388 if target_cp == 0x0A and pos >= 2 and text[pos - 2] == '\r':

389 return pos - 2

390

391 # Fast path: ASCII (except LF) starts its own cluster

392 if target_cp < 0x80:

393 # GB9b: Check for preceding PREPEND (rare: Arabic/Brahmic)

394 if pos >= 2 and target_cp >= 0x20:

395 prev_cp = ord(text[pos - 2])

396 if prev_cp >= 0x80 and _grapheme_cluster_break(prev_cp) == GCB.PREPEND:

397 return _find_cluster_start(text, pos - 1)

398 return pos - 1

399

400 # Scan backward to find a safe starting point

401 safe_start = pos - 1

402 while safe_start > 0 and (pos - safe_start) < MAX_GRAPHEME_SCAN:

403 cp = ord(text[safe_start])

404 if 0x20 <= cp < 0x80: # ASCII always starts a cluster

405 break

406 if _grapheme_cluster_break(cp) == GCB.CONTROL: # GB4

407 break

408 safe_start -= 1

409

410 # Verify forward to find the actual cluster boundary

411 cluster_start = safe_start

412 left_gcb = _grapheme_cluster_break(ord(text[safe_start]))

413 ri_count = 1 if left_gcb == GCB.REGIONAL_INDICATOR else 0

414

415 for i in range(safe_start + 1, pos):

416 right_gcb = _grapheme_cluster_break(ord(text[i]))

417 result = _should_break(left_gcb, right_gcb, text, i, ri_count)

418 ri_count = result.ri_count

419 if result.should_break:

420 cluster_start = i

421 left_gcb = right_gcb

422

423 return cluster_start

424

425

426def grapheme_boundary_before(unistr: str, pos: int) -> int:

427 r"""

428 Find the grapheme cluster boundary immediately before a position.

429

430 :param unistr: The Unicode string to search.

431 :param pos: Position in the string (0 < pos <= len(unistr)).

432 :returns: Start index of the grapheme cluster containing the character at pos-1.

433

434 Example::

435

436 >>> grapheme_boundary_before('Hello \U0001F44B\U0001F3FB', 8)

437 6

438 >>> grapheme_boundary_before('a\r\nb', 3)

439 1

440

441 .. versionadded:: 0.3.6

442 """

443 if pos <= 0:

444 return 0

445 return _find_cluster_start(unistr, min(pos, len(unistr)))

446

447

448def iter_graphemes_reverse(

449 unistr: str,

450 start: int = 0,

451 end: Optional[int] = None,

452) -> Iterator[str]:

453 r"""

454 Iterate over grapheme clusters in reverse order (last to first).

455

456 :param unistr: The Unicode string to segment.

457 :param start: Starting index (default 0).

458 :param end: Ending index (default len(unistr)).

459 :yields: Grapheme cluster substrings in reverse order.

460

461 Example::

462

463 >>> list(iter_graphemes_reverse('cafe\u0301'))

464 ['e\u0301', 'f', 'a', 'c']

465

466 .. versionadded:: 0.3.6

467 """

468 if not unistr:

469 return

470

471 length = len(unistr)

472

473 end = length if end is None else min(end, length)

474 start = max(start, 0)

475

476 if start >= end or start >= length:

477 return

478

479 pos = end

480 while pos > start:

481 cluster_start = _find_cluster_start(unistr, pos)

482 # Don't yield partial graphemes that extend before start

483 if cluster_start < start:

484 break

485 yield unistr[cluster_start:pos]

486 pos = cluster_start

487

488

489# Bind iter_graphemes at module level to avoid per-call dispatch overhead.

490iter_graphemes = (

491 _iter_graphemes_stdlib if _HAS_PYTHON315_ITER_GRAPHEMES

492 else _iter_graphemes_python

493)