Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/language.py: 53%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

36 statements  

1"""Three-tier language detection for filling DetectionResult languages. 

2 

3Tier 1: hardcoded mapping for single-language encodings (e.g. Big5 -> Chinese). 

4Tier 2: statistical bigram scoring against the encoding's language-model variants. 

5Tier 3: decode to UTF-8 and score against the UTF-8 byte-level language models. 

6 

7Note: ``from __future__ import annotations`` is intentionally omitted because 

8this module is compiled with mypyc, which does not support PEP 563 string 

9annotations. 

10""" 

11 

12from chardet.models import ( 

13 BigramProfile, 

14 has_model_variants, 

15 infer_language, 

16 score_best_language, 

17) 

18from chardet.pipeline import DetectionResult 

19 

20# Maximum bytes of data used for language scoring. 

21# Language bigrams converge quickly — 2 KB is sufficient for discrimination 

22# across all language models while keeping Tier 3 (language-model scoring) fast. 

23_LANG_SCORE_MAX_BYTES = 2048 

24 

25 

26def _to_utf8(data: bytes, encoding: str) -> bytes | None: 

27 """Decode data from encoding and re-encode as UTF-8 for language scoring. 

28 

29 Returns None if the encoding is unknown. For UTF-8, returns data as-is. 

30 Uses ``errors="ignore"`` because the data already passed byte-validity 

31 filtering for the detected encoding; any residual invalid bytes are 

32 irrelevant for language scoring. 

33 """ 

34 if encoding == "utf-8": 

35 return data 

36 try: 

37 return data.decode(encoding, errors="ignore").encode( 

38 "utf-8", errors="surrogatepass" 

39 ) 

40 except (LookupError, TypeError, ValueError): 

41 return None 

42 

43 

def fill_languages(
    data: bytes, results: list[DetectionResult]
) -> list[DetectionResult]:
    """Fill missing ``language`` fields on text results via the three-tier algorithm.

    Tier 1: single-language encodings via ``infer_language`` (hardcoded map).
    Tier 2: multi-language encodings via statistical bigram scoring of the raw
    bytes against the encoding's model variants.
    Tier 3: transcode to UTF-8 and score against the UTF-8 language models
    (universal fallback).

    Each tier only runs when the previous one left the language as ``None``.
    Binary results (``encoding is None``) are passed through unchanged, as are
    results that already have a non-``None`` language. Results whose language
    cannot be determined are also passed through unchanged.

    :param data: The raw byte data the results were produced from. Only the
        first ``_LANG_SCORE_MAX_BYTES`` bytes are used for scoring.
    :param results: A list of :class:`DetectionResult` from the pipeline.
    :returns: A new list, in the same order as *results*, with ``language``
        filled in where possible. Entries that needed a language are replaced
        by new :class:`DetectionResult` objects; the others are reused as-is.
    """
    data = data[:_LANG_SCORE_MAX_BYTES]
    filled: list[DetectionResult] = []
    # Bigram profile of the raw (truncated) bytes; built lazily and shared by
    # every Tier 2 call because the input bytes are the same for all results.
    profile: BigramProfile | None = None
    # Bigram profile for Tier 3 when the result's encoding is "utf-8". It is
    # cached ONLY for that case: for any other encoding the transcoded bytes
    # differ per encoding, so their profile must never be stored here and
    # later reused for a different result.
    utf8_profile: BigramProfile | None = None
    for result in results:
        if result.language is not None or result.encoding is None:
            filled.append(result)
            continue
        encoding = result.encoding
        # Tier 1: single-language encoding
        lang = infer_language(encoding)
        # Tier 2: statistical scoring for multi-language encodings
        if lang is None and data and has_model_variants(encoding):
            if profile is None:
                profile = BigramProfile(data)
            _, lang = score_best_language(data, encoding, profile=profile)
        # Tier 3: decode to UTF-8, score against UTF-8 language models
        if lang is None and data and has_model_variants("utf-8"):
            utf8_data = _to_utf8(data, encoding)
            if utf8_data:
                if encoding == "utf-8":
                    if utf8_profile is None:
                        utf8_profile = BigramProfile(utf8_data)
                    tier3_profile = utf8_profile
                else:
                    # Encoding-specific transcoding: build a throwaway profile
                    # so it always matches the bytes being scored.
                    tier3_profile = BigramProfile(utf8_data)
                _, lang = score_best_language(
                    utf8_data, "utf-8", profile=tier3_profile
                )
        if lang is None:
            filled.append(result)
        else:
            filled.append(
                DetectionResult(encoding, result.confidence, lang, result.mime_type)
            )
    return filled