3636 # std imports
3737 from collections .abc import Iterator
3838
39+ # Maximum number of code points to scan backward when locating a grapheme cluster boundary.
40+ # Typical grapheme clusters are far shorter than this, leaving margin; longer sequences are treated as pathological and the backward scan stops at this limit.
41+ MAX_GRAPHEME_SCAN = 32
42+
3943
4044class GCB (IntEnum ):
4145 """Grapheme Cluster Break property values."""
@@ -317,51 +321,42 @@ def _find_cluster_start(text: str, pos: int) -> int:
317321 :param pos: Position to search before (exclusive).
318322 :returns: Start position of the grapheme cluster.
319323 """
320- # We're finding the cluster containing text[pos-1]
321- target_char = text [pos - 1 ]
322- target_cp = ord (target_char )
324+ target_cp = ord (text [pos - 1 ])
323325
324326 # GB3: CR x LF - LF after CR is part of same cluster
325- if target_char == ' \n ' and pos >= 2 and text [pos - 2 ] == '\r ' :
327+ if target_cp == 0x0A and pos >= 2 and text [pos - 2 ] == '\r ' :
326328 return pos - 2
327329
328330 # Fast path: ASCII (except LF) starts its own cluster
329331 if target_cp < 0x80 :
330332 # GB9b: Check for preceding PREPEND (rare: Arabic/Brahmic)
331333 if pos >= 2 and target_cp >= 0x20 :
332- preceding_cp = ord (text [pos - 2 ])
333- if preceding_cp >= 0x80 and _grapheme_cluster_break (preceding_cp ) == GCB .PREPEND :
334+ prev_cp = ord (text [pos - 2 ])
335+ if prev_cp >= 0x80 and _grapheme_cluster_break (prev_cp ) == GCB .PREPEND :
334336 return _find_cluster_start (text , pos - 1 )
335337 return pos - 1
336338
337- # Phase 1: Scan backward to find a safe starting point
339+ # Scan backward to find a safe starting point
338340 safe_start = pos - 1
339- max_scan = 32 # Bounded by max grapheme cluster complexity
340-
341- while safe_start > 0 and (pos - safe_start ) < max_scan :
341+ while safe_start > 0 and (pos - safe_start ) < MAX_GRAPHEME_SCAN :
342342 cp = ord (text [safe_start ])
343343 if 0x20 <= cp < 0x80 : # ASCII always starts a cluster
344344 break
345- if _grapheme_cluster_break (cp ) == GCB .CONTROL : # Control breaks after ( GB4)
345+ if _grapheme_cluster_break (cp ) == GCB .CONTROL : # GB4
346346 break
347347 safe_start -= 1
348348
349- # Phase 2: Verify forward to find the actual cluster boundary
349+ # Verify forward to find the actual cluster boundary
350350 cluster_start = safe_start
351- ri_count = 0
352-
353351 left_gcb = _grapheme_cluster_break (ord (text [safe_start ]))
354- if left_gcb == GCB .REGIONAL_INDICATOR :
355- ri_count = 1
352+ ri_count = 1 if left_gcb == GCB .REGIONAL_INDICATOR else 0
356353
357354 for i in range (safe_start + 1 , pos ):
358355 right_gcb = _grapheme_cluster_break (ord (text [i ]))
359356 result = _should_break (left_gcb , right_gcb , text , i , ri_count )
360357 ri_count = result .ri_count
361-
362358 if result .should_break :
363359 cluster_start = i
364-
365360 left_gcb = right_gcb
366361
367362 return cluster_start
@@ -386,10 +381,7 @@ def grapheme_boundary_before(unistr: str, pos: int) -> int:
386381 """
387382 if pos <= 0 :
388383 return 0
389- if pos > len (unistr ):
390- pos = len (unistr )
391-
392- return _find_cluster_start (unistr , pos )
384+ return _find_cluster_start (unistr , min (pos , len (unistr )))
393385
394386
395387def iter_graphemes_reverse (
@@ -417,14 +409,8 @@ def iter_graphemes_reverse(
417409
418410 length = len (unistr )
419411
420- if end is None :
421- end = length
422- else :
423- end = min (end , length )
424-
425- # Clamp start to valid range
426- if start < 0 :
427- start = 0
412+ end = length if end is None else min (end , length )
413+ start = max (start , 0 )
428414
429415 if start >= end or start >= length :
430416 return
0 commit comments