Coverage Report

Created: 2026-03-12 06:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/source/i18n/inputext.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
 **********************************************************************
5
 *   Copyright (C) 2005-2016, International Business Machines
6
 *   Corporation and others.  All Rights Reserved.
7
 **********************************************************************
8
 */
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_CONVERSION
13
14
#include "inputext.h"
15
16
#include "cmemory.h"
17
#include "cstring.h"
18
19
#include <string.h>
20
21
U_NAMESPACE_BEGIN
22
23
218k
// Size, in bytes, of the fInputBytes working buffer allocated by the
// constructor; MungeInput() never places more than this many bytes in it.
#define BUFFER_SIZE 8192
24
25
62.6k
// Requests room for `count` objects of `type` from uprv_malloc() and casts the
// result to `type *`.  No constructors are run; release with DELETE_ARRAY.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
26
94.0k
// Hands a buffer back through uprv_free(); the counterpart of NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
27
28
/**
 * Allocates the two working buffers and leaves the object in its
 * "no text set" state (isSet() returns false until setText() is called).
 *
 * @param status  Set to U_MEMORY_ALLOCATION_ERROR when either buffer could
 *                not be allocated; left untouched otherwise.
 */
InputText::InputText(UErrorCode &status)
    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
                                                 //   removed if appropriate.
      fByteStats(NEW_ARRAY(int16_t, 256)),       // Per-byte-value occurrence counts for the input text.
                                                 //   Absolute counts, filled in by MungeInput().
      fDeclaredEncoding(0),
      fRawInput(0),
      fRawLength(0)
{
    // Give the remaining state defined values too, so that nothing reads an
    // indeterminate member if the object is used before setText().  Assigned
    // here rather than in the initializer list so the code does not depend on
    // the member declaration order in the header.
    fInputLen = 0;
    fC1Bytes  = FALSE;

    if (fInputBytes == NULL || fByteStats == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
41
42
/**
 * Releases the buffers this object allocated: the munged-input buffer, the
 * byte statistics table and the copy of the declared encoding name.
 * The raw input pointer handed to setText() is not freed here.
 */
InputText::~InputText()
{
    uprv_free((void *) fInputBytes);
    uprv_free((void *) fByteStats);
    uprv_free((void *) fDeclaredEncoding);
}
48
49
void InputText::setText(const char *in, int32_t len)
50
31.2k
{
51
31.2k
    fInputLen  = 0;
52
31.2k
    fC1Bytes   = FALSE;
53
31.2k
    fRawInput  = (const uint8_t *) in;
54
31.2k
    fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
55
31.2k
}
56
57
void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
58
0
{
59
0
    if(encoding) {
60
0
        if (len == -1) {
61
0
            len = (int32_t)uprv_strlen(encoding);
62
0
        }
63
64
0
        len += 1;     // to make place for the \0 at the end.
65
0
        uprv_free(fDeclaredEncoding);
66
0
        fDeclaredEncoding = NEW_ARRAY(char, len);
67
0
        uprv_strncpy(fDeclaredEncoding, encoding, len);
68
0
    }
69
0
}
70
71
/**
 * Reports whether raw input is currently attached, i.e. whether the stored
 * raw input pointer is non-NULL.
 */
UBool InputText::isSet() const
{
    const UBool hasText = (fRawInput != NULL);
    return hasText;
}
75
76
/**
77
*  MungeInput - after getting a set of raw input data to be analyzed, preprocess
78
*               it by removing what appears to be html markup.
79
* 
80
* @internal
81
*/
82
31.2k
void InputText::MungeInput(UBool fStripTags) {
83
31.2k
    int     srci = 0;
84
31.2k
    int     dsti = 0;
85
31.2k
    uint8_t b;
86
31.2k
    bool    inMarkup = FALSE;
87
31.2k
    int32_t openTags = 0;
88
31.2k
    int32_t badTags  = 0;
89
90
    //
91
    //  html / xml markup stripping.
92
    //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
93
    //     discard everything within < brackets >
94
    //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
95
    //     guess as to whether the input was actually marked up at all.
96
    // TODO: Think about how this interacts with EBCDIC charsets that are detected.
97
31.2k
    if (fStripTags) {
98
187k
        for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
99
185k
            b = fRawInput[srci];
100
101
185k
            if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
102
0
                if (inMarkup) {
103
0
                    badTags += 1;
104
0
                }
105
106
0
                inMarkup = TRUE;
107
0
                openTags += 1;
108
0
            }
109
110
185k
            if (! inMarkup) {
111
185k
                fInputBytes[dsti++] = b;
112
185k
            }
113
114
185k
            if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
115
0
                inMarkup = FALSE;
116
0
            }
117
185k
        }
118
119
2.35k
        fInputLen = dsti;
120
2.35k
    }
121
122
    //
123
    //  If it looks like this input wasn't marked up, or if it looks like it's
124
    //    essentially nothing but markup abandon the markup stripping.
125
    //    Detection will have to work on the unstripped input.
126
    //
127
31.2k
    if (openTags<5 || openTags/5 < badTags || 
128
0
        (fInputLen < 100 && fRawLength>600))
129
31.2k
    {
130
31.2k
        int32_t limit = fRawLength;
131
132
31.2k
        if (limit > BUFFER_SIZE) {
133
1.47k
            limit = BUFFER_SIZE;
134
1.47k
        }
135
136
23.6M
        for (srci=0; srci<limit; srci++) {
137
23.6M
            fInputBytes[srci] = fRawInput[srci];
138
23.6M
        }
139
140
31.2k
        fInputLen = srci;
141
31.2k
    }
142
143
    //
144
    // Tally up the byte occurence statistics.
145
    // These are available for use by the various detectors.
146
    //
147
148
31.2k
    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
149
150
23.6M
    for (srci = 0; srci < fInputLen; srci += 1) {
151
23.6M
        fByteStats[fInputBytes[srci]] += 1;
152
23.6M
    }
153
154
402k
    for (int32_t i = 0x80; i <= 0x9F; i += 1) {
155
394k
        if (fByteStats[i] != 0) {
156
23.4k
            fC1Bytes = TRUE;
157
23.4k
            break;
158
23.4k
        }
159
394k
    }
160
31.2k
}
161
162
U_NAMESPACE_END
163
#endif
164