Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/enums.py: 100%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

41 statements  

1""" 

2All of the Enums that are used throughout the chardet package. 

3 

4:author: Dan Blanchard (dan.blanchard@gmail.com) 

5""" 

6 

7from enum import Flag, IntEnum, auto 

8 

9 

class InputState(IntEnum):
    """
    Enumerates the states a universal detector can be in.
    """

    # Members are numbered consecutively from zero, in declaration order.
    PURE_ASCII, ESC_ASCII, HIGH_BYTE = range(3)

18 

19 

class LanguageFilter(Flag):
    """
    Bit flags for the different language filters that can be applied to a
    ``UniversalDetector``.
    """

    # Single-bit flags, written out explicitly (same values ``auto()`` yields).
    CHINESE_SIMPLIFIED = 1 << 0
    CHINESE_TRADITIONAL = 1 << 1
    JAPANESE = 1 << 2
    KOREAN = 1 << 3
    NON_CJK = 1 << 4

    # Composite flags built by OR-ing the single-bit flags together.
    CHINESE = CHINESE_TRADITIONAL | CHINESE_SIMPLIFIED
    CJK = KOREAN | JAPANESE | CHINESE
    ALL = CJK | NON_CJK

34 

35 

class ProbingState(IntEnum):
    """
    Enumerates the states a prober can be in.
    """

    # Members are numbered consecutively from zero, in declaration order.
    DETECTING, FOUND_IT, NOT_ME = range(3)

44 

45 

class MachineState(IntEnum):
    """
    Enumerates the states a state machine can be in.
    """

    # Members are numbered consecutively from zero, in declaration order.
    START, ERROR, ITS_ME = range(3)

54 

55 

class SequenceLikelihood(IntEnum):
    """
    Enumerates how likely a character is to follow the previous one.
    """

    # Numbered 0 through 3, from NEGATIVE up to POSITIVE.
    NEGATIVE, UNLIKELY, LIKELY, POSITIVE = 0, 1, 2, 3

65 

66 

class CharacterCategory(IntEnum):
    """
    Categories that the language models for ``SingleByteCharsetProber``
    sort characters into.

    Any value below DIGIT is treated as a letter.
    """

    # Values count down from 0xFF (255) to 0xFB (251).
    UNDEFINED = 0xFF
    CONTROL = 0xFE
    SYMBOL = 0xFD
    LINE_BREAK = 0xFC
    DIGIT = 0xFB

80 

81 

class EncodingEra(Flag):
    """
    Bit flags for the different eras of character encodings, used to filter
    which encodings are considered during detection.

    Each flag's numeric value doubles as a preference tier for tie-breaking
    when confidence scores are very close: a lower value means a more
    preferred/modern era.
    """

    # UTF-8/16/32, Windows-125x, modern multibyte (widely used)
    MODERN_WEB = 1 << 0
    # ISO-8859-x (legacy but common)
    LEGACY_ISO = 1 << 1
    # Mac encodings (less common)
    LEGACY_MAC = 1 << 2
    # CP437, CP850, CP852, etc. (very legacy)
    DOS = 1 << 3
    # EBCDIC variants (CP037, CP500, etc.)
    MAINFRAME = 1 << 4

    # Union of every era defined above.
    ALL = (
        MODERN_WEB
        | LEGACY_ISO
        | LEGACY_MAC
        | DOS
        | MAINFRAME
    )