1"""Three-tier language detection for filling DetectionResult languages.
2
3Tier 1: hardcoded mapping for single-language encodings (e.g. Big5 -> Chinese).
4Tier 2: statistical bigram scoring against the encoding's language-model variants.
5Tier 3: decode to UTF-8 and score against the UTF-8 byte-level language models.
6
7Note: ``from __future__ import annotations`` is intentionally omitted because
8this module is compiled with mypyc, which does not support PEP 563 string
9annotations.
10"""
11
12from chardet.models import (
13 BigramProfile,
14 has_model_variants,
15 infer_language,
16 score_best_language,
17)
18from chardet.pipeline import DetectionResult
19
# Maximum number of leading bytes of the input used for language scoring.
# Language bigrams converge quickly — 2 KB is sufficient for discrimination
# across all language models. The input is truncated once, before any
# statistical scoring, so the cap keeps both Tier 2 and Tier 3 fast.
_LANG_SCORE_MAX_BYTES = 2048
24
25
26def _to_utf8(data: bytes, encoding: str) -> bytes | None:
27 """Decode data from encoding and re-encode as UTF-8 for language scoring.
28
29 Returns None if the encoding is unknown. For UTF-8, returns data as-is.
30 Uses ``errors="ignore"`` because the data already passed byte-validity
31 filtering for the detected encoding; any residual invalid bytes are
32 irrelevant for language scoring.
33 """
34 if encoding == "utf-8":
35 return data
36 try:
37 return data.decode(encoding, errors="ignore").encode(
38 "utf-8", errors="surrogatepass"
39 )
40 except (LookupError, TypeError, ValueError):
41 return None
42
43
def fill_languages(
    data: bytes, results: list[DetectionResult]
) -> list[DetectionResult]:
    """Fill missing ``language`` fields on text results via the three-tier algorithm.

    Tier 1: single-language encodings via hardcoded map (instant).
    Tier 2: multi-language encodings via statistical bigram scoring (lazy).
    Tier 3: decode to UTF-8, score against UTF-8 language models (universal fallback).

    Binary results (``encoding is None``) are passed through unchanged, as are
    results that already have a non-``None`` language. A result for which no
    tier produces a language is also passed through unchanged.

    :param data: The raw byte data the results were produced from. Truncated
        to the first 2 KB internally — bigram language models converge quickly.
    :param results: A list of :class:`DetectionResult` from the pipeline.
    :returns: A new list, in the same order as ``results``, with ``language``
        filled in where possible. Results that gain a language are replaced
        by new :class:`DetectionResult` instances; the input list is not
        mutated.
    """
    data = data[:_LANG_SCORE_MAX_BYTES]
    filled: list[DetectionResult] = []
    # Bigram profile of the raw (truncated) bytes; built lazily on the first
    # Tier 2 hit and shared by every later result.
    profile: BigramProfile | None = None
    # Tier 3 profile cache for results whose encoding is already "utf-8".
    # It is only ever built from the raw bytes, so it can never hold a profile
    # of some other encoding's transcoded output.
    utf8_profile: BigramProfile | None = None
    for result in results:
        if result.language is not None or result.encoding is None:
            filled.append(result)
            continue
        encoding = result.encoding
        # Tier 1: single-language encoding
        lang = infer_language(encoding)
        # Tier 2: statistical scoring for multi-language encodings
        if lang is None and data and has_model_variants(encoding):
            if profile is None:
                profile = BigramProfile(data)
            _, lang = score_best_language(data, encoding, profile=profile)
        # Tier 3: decode to UTF-8, score against UTF-8 language models
        if lang is None and data and has_model_variants("utf-8"):
            utf8_data = _to_utf8(data, encoding)
            if utf8_data:
                if encoding == "utf-8":
                    # _to_utf8 returns the raw bytes unchanged for UTF-8, so
                    # the profile is the same for every UTF-8 result.
                    if utf8_profile is None:
                        utf8_profile = BigramProfile(utf8_data)
                    tier3_profile = utf8_profile
                else:
                    # Transcoded bytes differ per source encoding; the profile
                    # must be built from exactly the bytes being scored and
                    # must not leak into the UTF-8 cache.
                    tier3_profile = BigramProfile(utf8_data)
                _, lang = score_best_language(
                    utf8_data, "utf-8", profile=tier3_profile
                )
        if lang is None:
            filled.append(result)
        else:
            filled.append(
                DetectionResult(encoding, result.confidence, lang, result.mime_type)
            )
    return filled