/src/kcodecs/src/probers/nsLatin1Prober.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- C++ -*- |
2 | | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | | |
4 | | SPDX-License-Identifier: MIT |
5 | | */ |
6 | | |
7 | | #include "nsLatin1Prober.h" |
8 | | #include <stdio.h> |
9 | | #include <stdlib.h> |
10 | | |
// Character classes. Latin1_CharToClass assigns one of these to every byte
// value, and Latin1ClassModel is indexed by pairs of them
// (previous class * CLASS_NUM + current class).
#define UDF 0 // undefined
#define OTH 1 // other
#define ASC 2 // ascii capital letter
#define ASS 3 // ascii small letter
#define ACV 4 // accent capital vowel
#define ACO 5 // accent capital other
#define ASV 6 // accent small vowel
#define ASO 7 // accent small other
#define CLASS_NUM 8 // total classes; also the row width of Latin1ClassModel
20 | | |
21 | | namespace kencodingprober |
22 | | { |
// Maps each byte value (0x00 - 0xFF) to its character class. The table has
// 32 rows of 8 entries, one entry per byte value; the trailing comment on
// each row gives the byte range it covers. HandleData() indexes it with the
// input byte converted to unsigned char.
// NOTE(review): the letter/undefined assignments in 0x80 - 0x9F look like
// they follow windows-1252 rather than strict ISO-8859-1 - confirm.
static const unsigned char Latin1_CharToClass[] = {
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F
    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87
    OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F
    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97
    OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF
    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF
    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7
    ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF
    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7
    ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
};
57 | | |
/* Score for one character class directly following another:
   0 : illegal
   1 : very unlikely
   2 : normal
   3 : very likely
*/
// Flattened CLASS_NUM x CLASS_NUM matrix. The row is the class of the
// previous character and the column is the class of the current one;
// HandleData() reads it as Latin1ClassModel[previous * CLASS_NUM + current].
// The score doubles as the index into mFreqCounter, and a score of 0 makes
// HandleData() give up with eNotMe.
static const unsigned char Latin1ClassModel[] = {
    /*       UDF OTH ASC ASS ACV ACO ASV ASO */
    /*UDF*/  0,  0,  0,  0,  0,  0,  0,  0,
    /*OTH*/  0,  3,  3,  3,  3,  3,  3,  3,
    /*ASC*/  0,  3,  3,  3,  3,  3,  3,  3,
    /*ASS*/  0,  3,  3,  3,  1,  1,  3,  3,
    /*ACV*/  0,  3,  3,  3,  1,  2,  1,  2,
    /*ACO*/  0,  3,  3,  3,  3,  3,  3,  3,
    /*ASV*/  0,  3,  1,  3,  1,  1,  1,  3,
    /*ASO*/  0,  3,  1,  3,  1,  1,  3,  3,
};
74 | | |
75 | | void nsLatin1Prober::Reset(void) |
76 | 3.13k | { |
77 | 3.13k | mState = eDetecting; |
78 | 3.13k | mLastCharClass = OTH; |
79 | 15.6k | for (int i = 0; i < FREQ_CAT_NUM; i++) { |
80 | 12.5k | mFreqCounter[i] = 0; |
81 | 12.5k | } |
82 | 3.13k | } |
83 | | |
84 | | nsProbingState nsLatin1Prober::HandleData(const char *aBuf, unsigned int aLen) |
85 | 3.13k | { |
86 | 3.13k | char *newBuf1 = nullptr; |
87 | 3.13k | unsigned int newLen1 = 0; |
88 | | |
89 | 3.13k | if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) { |
90 | 0 | newBuf1 = (char *)aBuf; |
91 | 0 | newLen1 = aLen; |
92 | 0 | } |
93 | | |
94 | 41.3M | for (unsigned int i = 0; i < newLen1; i++) { |
95 | 41.3M | const unsigned char charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]]; |
96 | 41.3M | const unsigned char freq = Latin1ClassModel[mLastCharClass * CLASS_NUM + charClass]; |
97 | 41.3M | if (freq == 0) { |
98 | 330 | mState = eNotMe; |
99 | 330 | break; |
100 | 330 | } |
101 | 41.3M | mFreqCounter[freq]++; |
102 | 41.3M | mLastCharClass = charClass; |
103 | 41.3M | } |
104 | | |
105 | 3.13k | if (newBuf1 != aBuf) { |
106 | 3.13k | free(newBuf1); |
107 | 3.13k | } |
108 | | |
109 | 3.13k | return mState; |
110 | 3.13k | } |
111 | | |
112 | | float nsLatin1Prober::GetConfidence(void) |
113 | 0 | { |
114 | 0 | if (mState == eNotMe) { |
115 | 0 | return 0.01f; |
116 | 0 | } |
117 | | |
118 | 0 | float confidence; |
119 | 0 | unsigned int total = 0; |
120 | 0 | for (int i = 0; i < FREQ_CAT_NUM; i++) { |
121 | 0 | total += mFreqCounter[i]; |
122 | 0 | } |
123 | |
|
124 | 0 | if (!total) { |
125 | 0 | confidence = 0.0f; |
126 | 0 | } else { |
127 | 0 | confidence = mFreqCounter[3] * 1.0f / total; |
128 | 0 | confidence -= mFreqCounter[1] * 20.0f / total; |
129 | 0 | } |
130 | |
|
131 | 0 | if (confidence < 0.0f) { |
132 | 0 | confidence = 0.0f; |
133 | 0 | } |
134 | | |
135 | | // lower the confidence of latin1 so that other more accurate detector |
136 | | // can take priority. |
137 | 0 | confidence *= 0.50f; |
138 | |
|
139 | 0 | return confidence; |
140 | 0 | } |
141 | | |
142 | | #ifdef DEBUG_PROBE |
143 | | void nsLatin1Prober::DumpStatus() |
144 | | { |
145 | | printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); |
146 | | } |
147 | | #endif |
148 | | } |