Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/pipeline/language.py: 53%

1"""Three-tier language detection for filling DetectionResult languages.

3Tier 1: hardcoded mapping for single-language encodings (e.g. Big5 -> Chinese).

4Tier 2: statistical bigram scoring against the encoding's language-model variants.

5Tier 3: decode to UTF-8 and score against the UTF-8 byte-level language models.

7Note: ``from __future__ import annotations`` is intentionally omitted because

8this module is compiled with mypyc, which does not support PEP 563 string

9annotations.

10"""

12from chardet.models import (

13 BigramProfile,

14 has_model_variants,

15 infer_language,

16 score_best_language,

17)

18from chardet.pipeline import DetectionResult

20# Maximum bytes of data used for language scoring.

21# Language bigrams converge quickly — 2 KB is sufficient for discrimination

22# across all language models while keeping Tier 3 (language-model scoring) fast.

23_LANG_SCORE_MAX_BYTES = 2048

26def _to_utf8(data: bytes, encoding: str) -> bytes | None:

27 """Decode data from encoding and re-encode as UTF-8 for language scoring.

29 Returns None if the encoding is unknown. For UTF-8, returns data as-is.

30 Uses ``errors="ignore"`` because the data already passed byte-validity

31 filtering for the detected encoding; any residual invalid bytes are

32 irrelevant for language scoring.

33 """

34 if encoding == "utf-8":

35 return data

36 try:

37 return data.decode(encoding, errors="ignore").encode(

38 "utf-8", errors="surrogatepass"

39 )

40 except (LookupError, TypeError, ValueError):

41 return None

def fill_languages(
    data: bytes, results: list[DetectionResult]
) -> list[DetectionResult]:
    """Fill missing ``language`` fields on text results via the three-tier algorithm.

    Tier 1: single-language encodings via hardcoded map (instant).
    Tier 2: multi-language encodings via statistical bigram scoring (lazy).
    Tier 3: decode to UTF-8, score against UTF-8 language models (universal fallback).

    Binary results (``encoding is None``) are passed through unchanged, as are
    results that already have a non-``None`` language. Results for which no
    tier yields a language are also passed through unchanged.

    :param data: The raw byte data the results were produced from. Truncated
        to the first 2 KB internally — bigram language models converge quickly.
    :param results: A list of :class:`DetectionResult` from the pipeline.
    :returns: A new list, in the same order as ``results``, with ``language``
        filled in where possible.
    """
    data = data[:_LANG_SCORE_MAX_BYTES]
    filled: list[DetectionResult] = []
    # Profile of the raw (truncated) bytes, built lazily and shared by every
    # Tier 2 scoring call.
    profile: BigramProfile | None = None
    # Profile of the UTF-8 transcoded bytes used by Tier 3, together with the
    # source encoding it was built from. The transcoded bytes depend on the
    # source encoding, so the profile may only be reused for a result with
    # that same encoding; otherwise it would describe different data.
    utf8_profile: BigramProfile | None = None
    utf8_profile_encoding: str | None = None
    for result in results:
        if result.language is not None or result.encoding is None:
            filled.append(result)
            continue
        encoding = result.encoding
        # Tier 1: single-language encoding
        lang = infer_language(encoding)
        # Tier 2: statistical scoring for multi-language encodings
        if lang is None and data and has_model_variants(encoding):
            if profile is None:
                profile = BigramProfile(data)
            _, lang = score_best_language(data, encoding, profile=profile)
        # Tier 3: decode to UTF-8, score against UTF-8 language models
        if lang is None and data and has_model_variants("utf-8"):
            utf8_data = _to_utf8(data, encoding)
            # None (unknown codec) or empty output means there is nothing to score.
            if utf8_data:
                if utf8_profile is None or utf8_profile_encoding != encoding:
                    utf8_profile = BigramProfile(utf8_data)
                    utf8_profile_encoding = encoding
                _, lang = score_best_language(utf8_data, "utf-8", profile=utf8_profile)
        if lang is None:
            filled.append(result)
        else:
            filled.append(
                DetectionResult(encoding, result.confidence, lang, result.mime_type)
            )
    return filled