Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/wcwidth/grapheme.py: 22%

1"""

2Grapheme cluster segmentation following Unicode Standard Annex #29.

4This module provides pure-Python implementation of the grapheme cluster boundary algorithm as

5defined in UAX #29: Unicode Text Segmentation.

7https://www.unicode.org/reports/tr29/

8"""

10from __future__ import annotations

12# std imports

13from enum import IntEnum

14from functools import lru_cache

16from typing import TYPE_CHECKING, NamedTuple

18# local

19from .bisearch import bisearch as _bisearch

20from .table_grapheme import (GRAPHEME_L,

21 GRAPHEME_T,

22 GRAPHEME_V,

23 GRAPHEME_LV,

24 INCB_EXTEND,

25 INCB_LINKER,

26 GRAPHEME_LVT,

27 INCB_CONSONANT,

28 GRAPHEME_EXTEND,

29 GRAPHEME_CONTROL,

30 GRAPHEME_PREPEND,

31 GRAPHEME_SPACINGMARK,

32 EXTENDED_PICTOGRAPHIC,

33 GRAPHEME_REGIONAL_INDICATOR)

35if TYPE_CHECKING: # pragma: no cover

36 # std imports

37 from collections.abc import Iterator

39# Maximum backward scan distance when finding grapheme cluster boundaries.

40# Covers all known Unicode grapheme clusters with margin; longer sequences are pathological.

41MAX_GRAPHEME_SCAN = 32

44class GCB(IntEnum):

45 """Grapheme Cluster Break property values."""

47 OTHER = 0

48 CR = 1

49 LF = 2

50 CONTROL = 3

51 EXTEND = 4

52 ZWJ = 5

53 REGIONAL_INDICATOR = 6

54 PREPEND = 7

55 SPACING_MARK = 8

56 L = 9

57 V = 10

58 T = 11

59 LV = 12

60 LVT = 13

63# All lru_cache sizes in this file use maxsize=1024, chosen by benchmarking UDHR data (500+

64# languages) and considering typical process-long sessions: western scripts need ~64 unique

65# codepoints, but CJK could reach ~2000 -- but likely not.

66@lru_cache(maxsize=1024)

67def _grapheme_cluster_break(ucs: int) -> GCB:

68 # pylint: disable=too-many-branches,too-complex

69 """Return the Grapheme_Cluster_Break property for a codepoint."""

70 # Single codepoint matches

71 if ucs == 0x000d:

72 return GCB.CR

73 if ucs == 0x000a:

74 return GCB.LF

75 if ucs == 0x200d:

76 return GCB.ZWJ

77 # Matching by codepoint ranges, requiring binary search

78 if _bisearch(ucs, GRAPHEME_CONTROL):

79 return GCB.CONTROL

80 if _bisearch(ucs, GRAPHEME_EXTEND):

81 return GCB.EXTEND

82 if _bisearch(ucs, GRAPHEME_REGIONAL_INDICATOR):

83 return GCB.REGIONAL_INDICATOR

84 if _bisearch(ucs, GRAPHEME_PREPEND):

85 return GCB.PREPEND

86 if _bisearch(ucs, GRAPHEME_SPACINGMARK):

87 return GCB.SPACING_MARK

88 if _bisearch(ucs, GRAPHEME_L):

89 return GCB.L

90 if _bisearch(ucs, GRAPHEME_V):

91 return GCB.V

92 if _bisearch(ucs, GRAPHEME_T):

93 return GCB.T

94 if _bisearch(ucs, GRAPHEME_LV):

95 return GCB.LV

96 if _bisearch(ucs, GRAPHEME_LVT):

97 return GCB.LVT

98 return GCB.OTHER

100

101@lru_cache(maxsize=1024)

102def _is_extended_pictographic(ucs: int) -> bool:

103 """Check if codepoint has Extended_Pictographic property."""

104 return bool(_bisearch(ucs, EXTENDED_PICTOGRAPHIC))

105

106

107@lru_cache(maxsize=1024)

108def _is_incb_linker(ucs: int) -> bool:

109 """Check if codepoint has InCB=Linker property."""

110 return bool(_bisearch(ucs, INCB_LINKER))

111

112

113@lru_cache(maxsize=1024)

114def _is_incb_consonant(ucs: int) -> bool:

115 """Check if codepoint has InCB=Consonant property."""

116 return bool(_bisearch(ucs, INCB_CONSONANT))

117

118

119@lru_cache(maxsize=1024)

120def _is_incb_extend(ucs: int) -> bool:

121 """Check if codepoint has InCB=Extend property."""

122 return bool(_bisearch(ucs, INCB_EXTEND))

123

124

125class BreakResult(NamedTuple):

126 """Result of grapheme cluster break decision."""

127

128 should_break: bool

129 ri_count: int

130

131

132@lru_cache(maxsize=1024)

133def _simple_break_check(prev_gcb: GCB, curr_gcb: GCB) -> BreakResult | None:

134 """

135 Check simple GCB-pair-based break rules (cacheable).

136

137 Returns BreakResult for rules that can be determined from GCB properties alone, or None if

138 complex lookback rules (GB9c, GB11) need to be checked.

139 """

140 # GB3: CR x LF

141 if prev_gcb == GCB.CR and curr_gcb == GCB.LF:

142 return BreakResult(should_break=False, ri_count=0)

143

144 # GB4: (Control|CR|LF) ÷

145 if prev_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):

146 return BreakResult(should_break=True, ri_count=0)

147

148 # GB5: ÷ (Control|CR|LF)

149 if curr_gcb in (GCB.CONTROL, GCB.CR, GCB.LF):

150 return BreakResult(should_break=True, ri_count=0)

151

152 # GB6: L x (L|V|LV|LVT)

153 if prev_gcb == GCB.L and curr_gcb in (GCB.L, GCB.V, GCB.LV, GCB.LVT):

154 return BreakResult(should_break=False, ri_count=0)

155

156 # GB7: (LV|V) x (V|T)

157 if prev_gcb in (GCB.LV, GCB.V) and curr_gcb in (GCB.V, GCB.T):

158 return BreakResult(should_break=False, ri_count=0)

159

160 # GB8: (LVT|T) x T

161 if prev_gcb in (GCB.LVT, GCB.T) and curr_gcb == GCB.T:

162 return BreakResult(should_break=False, ri_count=0)

163

164 # GB9: x (Extend|ZWJ) - but ZWJ needs GB11 check, so only handle Extend here

165 if curr_gcb == GCB.EXTEND:

166 return BreakResult(should_break=False, ri_count=0)

167

168 # GB9a: x SpacingMark

169 if curr_gcb == GCB.SPACING_MARK:

170 return BreakResult(should_break=False, ri_count=0)

171

172 # GB9b: Prepend x

173 if prev_gcb == GCB.PREPEND:

174 return BreakResult(should_break=False, ri_count=0)

175

176 # GB9c and GB11 need lookback - return None to signal complex check needed

177 # GB12/13 (RI pairs) need ri_count state - also handled in main function

178 return None

179

180

181def _should_break(

182 prev_gcb: GCB,

183 curr_gcb: GCB,

184 text: str,

185 curr_idx: int,

186 ri_count: int,

187) -> BreakResult:

188 # pylint: disable=too-many-branches,too-complex

189 """

190 Determine if there should be a grapheme cluster break between prev and curr.

191

192 Implements UAX #29 grapheme cluster boundary rules.

193 """

194 # Try cached simple rules first

195 result = _simple_break_check(prev_gcb, curr_gcb)

196 if result is not None:

197 return result

198

199 # GB9: x ZWJ (not cached because GB11 needs lookback when prev is ZWJ)

200 if curr_gcb == GCB.ZWJ:

201 return BreakResult(should_break=False, ri_count=0)

202

203 # GB9c: Indic conjunct cluster

204 # \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]* \p{InCB=Linker}

205 # [\p{InCB=Extend}\p{InCB=Linker}]* x \p{InCB=Consonant}

206 curr_ucs = ord(text[curr_idx])

207 if _is_incb_consonant(curr_ucs):

208 has_linker = False

209 i = curr_idx - 1

210 while i >= 0:

211 prev_ucs = ord(text[i])

212 if _is_incb_linker(prev_ucs):

213 has_linker = True

214 i -= 1

215 elif _is_incb_extend(prev_ucs):

216 i -= 1

217 elif _is_incb_consonant(prev_ucs):

218 if has_linker:

219 return BreakResult(should_break=False, ri_count=0)

220 break

221 else:

222 break

223

224 # GB11: ExtPict Extend* ZWJ x ExtPict

225 if prev_gcb == GCB.ZWJ and _is_extended_pictographic(curr_ucs):

226 i = curr_idx - 2 # Skip the ZWJ at curr_idx - 1

227 while i >= 0:

228 prev_ucs = ord(text[i])

229 prev_prop = _grapheme_cluster_break(prev_ucs)

230 if prev_prop == GCB.EXTEND:

231 i -= 1

232 elif _is_extended_pictographic(prev_ucs):

233 return BreakResult(should_break=False, ri_count=0)

234 else:

235 break

236

237 # GB12/GB13: RI x RI (pair matching)

238 if prev_gcb == GCB.REGIONAL_INDICATOR and curr_gcb == GCB.REGIONAL_INDICATOR:

239 if ri_count % 2 == 1:

240 return BreakResult(should_break=False, ri_count=ri_count + 1)

241 return BreakResult(should_break=True, ri_count=1)

242

243 # GB999: Any ÷ Any

244 ri_count = 1 if curr_gcb == GCB.REGIONAL_INDICATOR else 0

245 return BreakResult(should_break=True, ri_count=ri_count)

246

247

248def iter_graphemes(

249 unistr: str,

250 start: int = 0,

251 end: int | None = None,

252) -> Iterator[str]:

253 r"""

254 Iterate over grapheme clusters in a Unicode string.

255

256 Grapheme clusters are "user-perceived characters" - what a user would

257 consider a single character, which may consist of multiple Unicode

258 codepoints (e.g., a base character with combining marks, emoji sequences).

259

260 :param unistr: The Unicode string to segment.

261 :param start: Starting index (default 0).

262 :param end: Ending index (default len(unistr)).

263 :yields: Grapheme cluster substrings.

264

265 Example::

266

267 >>> list(iter_graphemes('cafe\u0301'))

268 ['c', 'a', 'f', 'e\u0301']

269 >>> list(iter_graphemes('\U0001F468\u200D\U0001F469\u200D\U0001F467'))

270 ['o', 'k', '\U0001F468\u200D\U0001F469\u200D\U0001F467']

271 >>> list(iter_graphemes('\U0001F1FA\U0001F1F8'))

272 ['o', 'k', '\U0001F1FA\U0001F1F8']

273

274 .. versionadded:: 0.3.0

275 """

276 if not unistr:

277 return

278

279 length = len(unistr)

280

281 if end is None:

282 end = length

283

284 if start >= end or start >= length:

285 return

286

287 end = min(end, length)

288

289 # Track state for grapheme cluster boundaries

290 cluster_start = start

291 ri_count = 0

292

293 # Get GCB for first character

294 prev_gcb = _grapheme_cluster_break(ord(unistr[start]))

295

296 # Handle Regional Indicator count initialization

297 if prev_gcb == GCB.REGIONAL_INDICATOR:

298 ri_count = 1

299

300 for idx in range(start + 1, end):

301 curr_gcb = _grapheme_cluster_break(ord(unistr[idx]))

302

303 result = _should_break(prev_gcb, curr_gcb, unistr, idx, ri_count)

304 ri_count = result.ri_count

305

306 if result.should_break:

307 yield unistr[cluster_start:idx]

308 cluster_start = idx

309

310 prev_gcb = curr_gcb

311

312 # Yield the final cluster

313 yield unistr[cluster_start:end]

314

315

316def _find_cluster_start(text: str, pos: int) -> int:

317 """

318 Find the start of the grapheme cluster containing the character before pos.

319

320 Scans backwards from pos to find a safe starting point, then iterates forward using standard

321 break rules to find the actual cluster boundary.

322

323 :param text: The Unicode string.

324 :param pos: Position to search before (exclusive).

325 :returns: Start position of the grapheme cluster.

326 """

327 target_cp = ord(text[pos - 1])

328

329 # GB3: CR x LF - LF after CR is part of same cluster

330 if target_cp == 0x0A and pos >= 2 and text[pos - 2] == '\r':

331 return pos - 2

332

333 # Fast path: ASCII (except LF) starts its own cluster

334 if target_cp < 0x80:

335 # GB9b: Check for preceding PREPEND (rare: Arabic/Brahmic)

336 if pos >= 2 and target_cp >= 0x20:

337 prev_cp = ord(text[pos - 2])

338 if prev_cp >= 0x80 and _grapheme_cluster_break(prev_cp) == GCB.PREPEND:

339 return _find_cluster_start(text, pos - 1)

340 return pos - 1

341

342 # Scan backward to find a safe starting point

343 safe_start = pos - 1

344 while safe_start > 0 and (pos - safe_start) < MAX_GRAPHEME_SCAN:

345 cp = ord(text[safe_start])

346 if 0x20 <= cp < 0x80: # ASCII always starts a cluster

347 break

348 if _grapheme_cluster_break(cp) == GCB.CONTROL: # GB4

349 break

350 safe_start -= 1

351

352 # Verify forward to find the actual cluster boundary

353 cluster_start = safe_start

354 left_gcb = _grapheme_cluster_break(ord(text[safe_start]))

355 ri_count = 1 if left_gcb == GCB.REGIONAL_INDICATOR else 0

356

357 for i in range(safe_start + 1, pos):

358 right_gcb = _grapheme_cluster_break(ord(text[i]))

359 result = _should_break(left_gcb, right_gcb, text, i, ri_count)

360 ri_count = result.ri_count

361 if result.should_break:

362 cluster_start = i

363 left_gcb = right_gcb

364

365 return cluster_start

366

367

368def grapheme_boundary_before(unistr: str, pos: int) -> int:

369 r"""

370 Find the grapheme cluster boundary immediately before a position.

371

372 :param unistr: The Unicode string to search.

373 :param pos: Position in the string (0 < pos <= len(unistr)).

374 :returns: Start index of the grapheme cluster containing the character at pos-1.

375

376 Example::

377

378 >>> grapheme_boundary_before('Hello \U0001F44B\U0001F3FB', 8)

379 6

380 >>> grapheme_boundary_before('a\r\nb', 3)

381 1

382

383 .. versionadded:: 0.3.6

384 """

385 if pos <= 0:

386 return 0

387 return _find_cluster_start(unistr, min(pos, len(unistr)))

388

389

390def iter_graphemes_reverse(

391 unistr: str,

392 start: int = 0,

393 end: int | None = None,

394) -> Iterator[str]:

395 r"""

396 Iterate over grapheme clusters in reverse order (last to first).

397

398 :param unistr: The Unicode string to segment.

399 :param start: Starting index (default 0).

400 :param end: Ending index (default len(unistr)).

401 :yields: Grapheme cluster substrings in reverse order.

402

403 Example::

404

405 >>> list(iter_graphemes_reverse('cafe\u0301'))

406 ['e\u0301', 'f', 'a', 'c']

407

408 .. versionadded:: 0.3.6

409 """

410 if not unistr:

411 return

412

413 length = len(unistr)

414

415 end = length if end is None else min(end, length)

416 start = max(start, 0)

417

418 if start >= end or start >= length:

419 return

420

421 pos = end

422 while pos > start:

423 cluster_start = _find_cluster_start(unistr, pos)

424 # Don't yield partial graphemes that extend before start

425 if cluster_start < start:

426 break

427 yield unistr[cluster_start:pos]

428 pos = cluster_start